# Load Libs

In [0]:
%reload_ext autoreload
%autoreload 2

!pip install seaborn --upgrade

import os, sys, numpy as np, pandas as pd, tensorflow as tf
import seaborn as sns, keras
sns.set(style='white')

from collections import Counter, OrderedDict
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder, scale
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Activation, Input, Dropout, Embedding, Flatten, Input
from keras.layers import dot, add, Lambda, Concatenate, multiply, BatchNormalization
from keras.optimizers import Adam, SGD, Adagrad
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences

np.set_printoptions(precision=4, suppress=True, linewidth=100)

# Data Preview

In [0]:
# Download the ratings and movies tables over HTTPS (network access required).
# Later cells read userId/movieId/rating from `ratings` and movieId/title/genres from `movies`.
ratings = pd.read_csv('https://storage.googleapis.com/allianz-course/data/ratings.csv')
movies = pd.read_csv('https://storage.googleapis.com/allianz-course/data/movies.csv')

In [0]:
print(movies.shape)
movies.head()

In [0]:
print(ratings.shape)
ratings.head()

# Encode

In [0]:
# Fit one label encoder per id column.
# The movie encoder is fitted on the movies table rather than on ratings, so a
# movieId that appears only in ratings would make transform() fail below.
uid_enc, mid_enc = LabelEncoder(), LabelEncoder()
uid_enc.fit(ratings.userId)
mid_enc.fit(movies.movieId)

# Replace the raw ids in both tables with the encoders' integer indices
ratings["userId"] = uid_enc.transform(ratings.userId)
ratings["movieId"] = mid_enc.transform(ratings.movieId)
movies["movieId"] = mid_enc.transform(movies.movieId)

# Lookup from encoded movie id to title (a Series indexed by movie id)
mid_map = pd.Series(dict(zip(movies.movieId, movies.title)))

# Number of distinct users and movies known to the encoders
n_users, n_movies = len(uid_enc.classes_), len(mid_enc.classes_)

# Split Train, Test Data
* 以4分為閾值, 4分(含)以上為positive, 未滿4分為negative
* 每個user分positive, negative兩部分, 各取30%到test data

In [0]:
def split_ratings(data, pos_thres=4, test_ratio=0.3):
    """Split MovieLens ratings into train and test sets, user by user.

    For each user with at least 5 ratings, the rows are divided into a
    positive part (rating >= pos_thres) and a negative part
    (rating < pos_thres). The trailing ``test_ratio`` share of each part goes
    to the test set and the leading rows go to the train set. Users with fewer
    than 5 ratings are dropped entirely.

    Args:
        data: DataFrame with at least the columns ``userId`` and ``rating``.
        pos_thres: Ratings at or above this value count as positive.
        test_ratio: Fraction of each part assigned to the test set.

    Returns:
        ``(train, test)`` DataFrames with a fresh 0-based index. Both are empty
        (same columns as ``data``) when no user has at least 5 ratings.
    """
    tr, te = [], []
    for _, df in data.groupby("userId"):
        if len(df) < 5:
            continue

        # Explicit masks on both sides, so a row with a NaN rating lands in neither part
        pos = df[df["rating"] >= pos_thres]
        neg = df[df["rating"] < pos_thres]
        for part in (pos, neg):
            cut = int(len(part) * (1 - test_ratio))
            tr.append(part[:cut])
            te.append(part[cut:])

    if not tr:
        # pd.concat raises on an empty list, so return empty frames explicitly
        empty = data.iloc[:0].reset_index(drop=True)
        return empty, empty.copy()

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    return pd.concat(tr, ignore_index=True), pd.concat(te, ignore_index=True)

tr, te = split_ratings(ratings, 4, .3)

# Make Rating Matrix (Interaction Between Users and Movies)

In [0]:
# Dense user x movie rating matrices for the train and test splits;
# cells that never receive a rating keep their initial value of 0
tr_rating_mat = np.zeros((n_users, n_movies))
te_rating_mat = np.zeros((n_users, n_movies))

# Write every (userId, movieId, rating) triple into the matching matrix
for rating_mat, split in ((tr_rating_mat, tr), (te_rating_mat, te)):
    for user_idx, movie_idx, score in zip(split.userId, split.movieId, split.rating):
        rating_mat[int(user_idx), int(movie_idx)] = score

print('Shape of train interaction matrix: ', tr_rating_mat.shape)
print(tr_rating_mat, '\n')
print('Shape of test interaction matrix: ', te_rating_mat.shape)
print(te_rating_mat)

# Encode Movies Table

In [0]:
def do_movies(movies):
    """Encode the movies table into model-ready features.

    Reads the module-level ``ratings`` table to derive per-movie statistics.

    Args:
        movies: DataFrame with the columns ``movieId``, ``title`` and
            ``genres`` (a '|'-separated string). The caller's frame is not
            modified; a re-indexed copy is returned.

    Returns:
        ``(movies, genres_map)`` where ``movies`` has:
          * ``genres``      - array of integer genre codes per movie,
          * ``avg_rating``  - standardized mean rating (missing -> mean),
          * ``freq_rating`` - standardized rating count (missing -> median),
          * ``year``        - standardized year taken from the last
            parenthesized number in the title (missing -> mean),
        and ``genres_map`` is the fitted LabelEncoder for genre names.
    """
    movies = movies.reset_index(drop=True)

    # Split the pipe-delimited genre string into a list of genre names
    movies["genres"] = movies.genres.str.split(r"\|")
    genres_cnt = Counter()
    for names in movies.genres:
        genres_cnt.update(names)
    genres_map = LabelEncoder()
    genres_map.fit(list(genres_cnt))
    movies["genres"] = movies.genres.map(lambda lst: genres_map.transform(lst))

    # Look the statistics up by movieId instead of relying on the row position
    # happening to equal the encoded movie id
    by_movie = ratings.groupby("movieId").rating
    movies["avg_rating"] = movies.movieId.map(by_movie.mean())
    movies["avg_rating"] = scale(movies.avg_rating.fillna(movies.avg_rating.mean()))
    movies["freq_rating"] = movies.movieId.map(by_movie.size())
    # Fixed: this used to scale avg_rating a second time instead of freq_rating
    movies["freq_rating"] = scale(movies.freq_rating.fillna(movies.freq_rating.median()))

    # The year is the last "(digits)" group in the title, when there is one
    movies["year"] = movies.title.str.findall(r"\(\s*(\d+)\s*\)").map(
        lambda lst: int(lst[-1]) if len(lst) else None)
    movies["year"] = scale(movies.year.fillna(movies.year.mean()))

    return movies, genres_map

movies_encoded, genres_map = do_movies(movies)
n_genres = len(genres_map.classes_)
movies_encoded.head()

# Encode Users Statistics

In [0]:
# user_encoded 
# Per-user statistics: number of ratings given and mean rating, both standardized
user_encoded = (ratings.groupby('userId').rating
                .agg(['size', 'mean'])
                .rename(columns={'size': 'user_rating_freq', 'mean': 'user_rating_mean'}))
for stat_col in ('user_rating_freq', 'user_rating_mean'):
    user_encoded[stat_col] = scale(user_encoded[stat_col])
# Turn the userId index back into a regular column so it can be merged on
user_encoded = user_encoded.reset_index()
user_encoded.head()

# 以leave one out方式產生 train data, test data
1. 每一筆資料有兩部分: [user query] + [item id]
2. 每一筆user query 包含所有user movie history, 除了當前的rating movie (candidate movie)
3. test data的user query來自於train data

In [0]:
def loo_preprocess(data, movies_encoded, train_hist=None, is_train=True):
    """Build leave-one-out samples: one row per rating, [user query] + [candidate movie].

    Each sample's query is the set of movies the user rated >= 4, minus the
    candidate movie of that sample. For training data the liked movies come
    from ``data`` itself; for test data they come from ``train_hist``.
    Movie features are merged from ``movies_encoded`` and user features from
    the module-level ``user_encoded`` table.

    Args:
        data: Ratings with the columns ``userId``, ``movieId`` and ``rating``.
        movies_encoded: Movie feature table keyed by ``movieId``.
        train_hist: Training ratings; required when ``is_train`` is False.
        is_train: Whether ``data`` is the training split.

    Returns:
        DataFrame with the columns user_id, query_movie_ids,
        query_movie_ids_len, user_rating_freq, user_rating_mean, genres,
        genres_len, avg_rating, freq_rating, year, candidate_movie_id, rating.

    Raises:
        ValueError: If ``is_train`` is False and ``train_hist`` is None.
    """
    if not is_train and train_hist is None:
        raise ValueError("train_hist is required when is_train=False")

    data = data.merge(movies_encoded, how="left", on="movieId")
    data = data.merge(user_encoded, how="left", on="userId")
    columns = ["user_id", "query_movie_ids", "query_movie_ids_len", "user_rating_freq", "user_rating_mean",
               "genres", "genres_len", "avg_rating", "freq_rating", "year", "candidate_movie_id",
               "rating"]

    # Collect every user's liked movies from the training history in one pass
    # instead of filtering the whole history once per user
    train_favs = {}
    if not is_train:
        liked = train_hist[train_hist.rating >= 4]
        train_favs = {uid: set(ids) for uid, ids in liked.groupby("userId").movieId}

    queue = []
    for u, df in data.groupby("userId"):
        df = df.sort_values("rating", ascending=False)
        # Movies this user rated positively (>= 4)
        if is_train:
            fav_movies = set(df.loc[df.rating >= 4, "movieId"])
        else:
            fav_movies = train_favs.get(u, set())
        for _, r in df.iterrows():
            # Leave the candidate movie out of its own query
            queries = list(fav_movies - {int(r.movieId)})
            # Every multivalent column is paired with a column holding its length
            queue.append([int(r.userId),
                          queries,
                          len(queries),
                          r.user_rating_freq,
                          r.user_rating_mean,
                          r.genres,
                          len(r.genres),
                          r.avg_rating,
                          r.freq_rating,
                          r.year,
                          int(r.movieId),
                          r.rating])
    return pd.DataFrame(queue, columns=columns)

trProcessed = loo_preprocess(tr, movies_encoded)
teProcessed = loo_preprocess(te, movies_encoded, tr, is_train=False)
trProcessed.head()

In [0]:
teProcessed.head()

## Data Function
1. 由於 Keras(Tensorflow backend) 不支援變動長度的columns, 需透過padding zero(補零)帶入
2. 每個變動長度的column, 需要再給lens描述每一筆資料的長度, ex: query_movie_ids, query_movie_ids_len

In [0]:
# Model input columns, in the order the generator emits them.
# 'global' is not a column of the processed data; dataFn adds it as a constant 0.
feats = ["query_movie_ids", "query_movie_ids_len", "user_rating_freq", "user_rating_mean",
         "genres", "genres_len", "avg_rating", "freq_rating", "year", "candidate_movie_id", 'global']
# Variable-length (multivalent) columns that get zero-padded per batch
multi_cols = ["query_movie_ids", 'genres']
# Name of the target column (dataFn reads `ret.rating` directly rather than this variable)
label = 'rating'

# Generator function
def dataFn(data, n_batch=128, shuffle=False):
    """Create an endless mini-batch generator factory for the rating model.

    Relies on the module-level ``feats`` and ``multi_cols`` lists and on the
    helpers ``get_minibatches_idx`` and ``do_multi``.

    Args:
        data: Processed samples containing every column in ``feats`` (except
            'global', which is added here) plus ``rating``.
        n_batch: Number of samples per batch; the last batch of a pass may be
            smaller.
        shuffle: Whether to reshuffle the sample order on every pass.

    Returns:
        A zero-argument function; calling it returns a generator that yields
        ``(inputs, labels)`` forever. ``inputs`` is a list with one array per
        entry of ``feats``: padded 2-D arrays for the multivalent columns and
        column vectors of shape (batch, 1) for the others. ``labels`` is the
        rating as a (batch, 1) array.
    """
    def fn():
        while True:
            dataInner = data.copy()
            indices = get_minibatches_idx(len(dataInner), n_batch, shuffle=shuffle)
            for ind in indices:
                ret = do_multi(dataInner.iloc[ind], multi_cols)
                # Constant input that drives the model's global-bias embedding
                ret['global'] = 0
                # Index the underlying ndarray: Series[:, None] is no longer
                # supported by pandas (removed in 2.0)
                inputs = [np.stack(ret[col].values) if col in multi_cols else ret[col].values[:, None]
                          for col in feats]
                yield inputs, ret.rating.values[:, None]
    return fn

def get_minibatches_idx(n, batch_size, shuffle=False):
    """Partition the indices 0..n-1 into consecutive mini-batches.

    Args:
        n: Total number of samples.
        batch_size: Size of every full batch.
        shuffle: Shuffle the index order (via ``np.random``) before slicing.

    Returns:
        A list of int32 index arrays. All have ``batch_size`` elements except
        possibly the last one, which holds whatever is left over.
    """
    order = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(order)

    # Start offsets of the full-size batches
    starts = [k * batch_size for k in range(n // batch_size)]
    batches = [order[lo:lo + batch_size] for lo in starts]

    # Anything past the last full batch becomes one final, shorter batch
    tail_start = len(starts) * batch_size
    if tail_start != n:
        batches.append(order[tail_start:])
    return batches

def do_multi(df, multi_cols):
    """Zero-pad each multivalent column to the longest sequence in ``df``.

    Works on a copy; the input frame is left untouched. Padding is appended
    after the values ("post").
    """
    padded = df.copy()
    for name in multi_cols:
        longest = padded[name].map(len).max()
        rows = pad_sequences(padded[name], padding="post", maxlen=longest)
        # Store one padded row per sample
        padded[name] = list(rows)
    return padded

# Pull a single shuffled batch of 5 samples to inspect what the generator yields.
# Note: this rebinds the module-level names `data` and `label`
# (`label` previously held the string 'rating').
for data, label in dataFn(trProcessed, n_batch=5, shuffle=True)():
    break

# Print each input array under its feature name, then the labels
for name, col in zip(feats, data):
    print(f'{name}\n{col}\n')
print(f'label\n{label}')

In [0]:
# Rebuild the inspected batch as a DataFrame for easier reading
tmp = pd.DataFrame()
for col, val in zip(feats, data):
    if col in multi_cols:
        # Keep each padded sequence as one cell
        tmp[col] = list(val)
    else:
        # Flatten the (batch, 1) column vector back to 1-D
        tmp[col] = val.ravel()

tmp['rating'] = label.ravel()
tmp

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

# Model of Matrix Factorization with DNN

## Build Model Function

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Build the matrix-factorization-with-DNN rating model.

    The prediction is dot(user_vector, movie_vector) + b_user + b_movie +
    b_global, with no output activation.

    Args:
        n_users: Accepted but not used inside this function.
        n_movies: Vocabulary size of both movie embeddings (query history and
            candidate movie).
        emb_size: Width of the movie embeddings and of every hidden Dense layer.
        reg: L2 regularization factor for the movie/genre embeddings and the
            Dense kernels (the global-bias embedding is not regularized).

    Returns:
        A tuple ``(model, movie_model)``: ``model`` maps all eleven inputs to
        the predicted score; ``movie_model`` maps only the six movie-side
        inputs to the 'movie_impression' vector.

    Note:
        The genre vocabulary size is read from the module-level ``n_genres``.
    """
    # Input tensors
    inp_query = Input([None], dtype='int32', name='inp_query')
    inp_query_len = Input([1], dtype='int32', name='inp_query_len')
    inp_u_freq = Input([1], dtype='float32', name='inp_u_freq')
    inp_u_mean = Input([1], dtype='float32', name='inp_u_mean')
    inp_genres = Input([None], dtype='int32', name='inp_genres')
    inp_genres_len = Input([1], dtype='int32', name='inp_genres_len')
    inp_avg_rating = Input([1], dtype='float32', name='inp_avg_rating')
    inp_freq_rating = Input([1], dtype='float32', name='inp_freq_rating')
    inp_year = Input([1], dtype='float32', name='inp_year')
    inp_movie = Input([1], dtype='int32', name='inp_movie')
    # Hack: only input integer => "0"
    inp_global = Input([1], dtype='int32', name='inp_global')

    # User, movie, genres embedding
    emb_query = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_query)
    emb_genres = Embedding(n_genres, 8, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_genres)
    emb_movie = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_movie)

    # User side
    def sqrtn(x):
        # Pool a padded sequence of embeddings into a single vector: the first
        # `lens` positions of each row are weighted 1/sqrt(lens) and the padding
        # positions get weight 0.
        # Assumes the padded length equals the longest `lens` in the batch.
        qry, lens = x
        lens = tf.reshape(lens, [-1])
        weights = tf.nn.l2_normalize(tf.sequence_mask(lens, dtype=tf.float32), 1)
        weights = tf.expand_dims(weights, -1)
        return tf.reduce_sum(qry * weights, 1)
    emb_query = Lambda(sqrtn, name='emb_query')([emb_query, inp_query_len])
    # Append the two scalar user statistics to the pooled history vector
    emb_query = Concatenate(1)([emb_query, inp_u_freq, inp_u_mean])
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_query)
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='user_impression')(emb_query)

    # Movie side
    emb_genres = Lambda(sqrtn, name='emb_genres')([emb_genres, inp_genres_len])
    emb_movie = Flatten(name='emb_movie')(emb_movie)
    # Candidate embedding + pooled genres + three scalar movie statistics
    emb_movie = Concatenate(1)([emb_movie, emb_genres, inp_avg_rating, inp_freq_rating, inp_year])
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_movie)
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='movie_impression')(emb_movie)

    # Bias terms
    # Projection of emb_query to get bias
    b_user = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_user')(emb_query)
    # Projection of emb_movie to get bias
    b_movie = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_movie')(emb_movie)
    # A 1x1 embedding looked up with the constant input acts as a learned scalar
    b_global = Flatten(name='b_global')(Embedding(1, 1, embeddings_initializer='glorot_uniform')(inp_global))

    # Implements the formulation
    nets = dot([emb_query, emb_movie], axes=1)
    nets = add([nets, b_user, b_movie, b_global])

    model = Model([inp_query,
                   inp_query_len,
                   inp_u_freq,
                   inp_u_mean,
                   inp_genres,
                   inp_genres_len,
                   inp_avg_rating,
                   inp_freq_rating,
                   inp_year,
                   inp_movie,
                   inp_global], nets)
    model.summary()
    # Second model shares the movie-side layers and exposes the movie vector
    return model, Model([inp_movie,
                         inp_genres,
                         inp_genres_len,
                         inp_avg_rating,
                         inp_freq_rating,
                         inp_year], emb_movie)

# Hyper-parameters for the regression model
emb_size = 8
reg = 0.0005
batch_size = 128
epochs = 10

# Reset the Keras backend session before building a fresh model
K.clear_session()
model_mf_dnn, model_emb_movie = get_model(n_users, n_movies, emb_size, reg)
# Regression on the raw rating: plain SGD minimizing mean squared error
model_mf_dnn.compile(optimizer=SGD(lr=0.05), loss='mse')

# Training

## Use Callback Function
* keras.callbacks.ModelCheckpoint: 只存檔最好的結果, 是另一種防止overfitting的方式
    * save_best_only = True

In [0]:
# Checkpoint path for the best weights seen during training
model_dir = "./model_mf_dnn"

tr_len = len(trProcessed)
te_len = len(teProcessed)
# Steps are ceil(len / batch_size) so one epoch covers every sample exactly once
hist = model_mf_dnn.fit_generator(
    generator=dataFn(trProcessed, n_batch=batch_size, shuffle=True)(),
    steps_per_epoch=tr_len // batch_size + (1 if tr_len % batch_size else 0),
    validation_data=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    validation_steps=te_len // batch_size + (1 if te_len % batch_size else 0),
    # batch_size=batch_size,
    epochs=epochs,
    callbacks=[ModelCheckpoint(filepath=model_dir,
                               save_weights_only=True,
                               save_best_only=True)]
)

# After training, load the best weights back
model_mf_dnn.load_weights(model_dir)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.lineplot(x=np.arange(len(hist.history['loss'])), y=hist.history['loss'], label='train')
sns.lineplot(x=np.arange(len(hist.history['val_loss'])), y=hist.history['val_loss'], label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction

In [0]:
te_len = len(teProcessed)
# Predict over the unshuffled test generator for exactly one pass
# (ceil(te_len / batch_size) batches), then flatten to a 1-D array
pred = model_mf_dnn.predict_generator(
    generator=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    steps=te_len // batch_size + (1 if te_len % batch_size else 0)
).ravel()
print('Shape of test data: ', pred.shape)

# Metrics
* 定義4分(含)以上為正向評價, 未滿4分為負向評價

## RMSE 

In [0]:
te_len = len(teProcessed)
# Number of batches needed to cover the test set exactly once
valis_steps = te_len // batch_size + (1 if te_len % batch_size else 0)

# Collect the true ratings in the same batch order used for `pred`.
# The generator never ends, so stop after one full pass.
te_y = []
for i, (feat, label) in enumerate(dataFn(teProcessed,
                                         n_batch=batch_size,
                                         shuffle=False)(), 1):
    if i > valis_steps: break
    te_y += label.ravel().tolist()

te_y = np.array(te_y)
print("RMSE: ", np.sqrt(np.mean((pred - te_y)**2)))

## AUC

In [0]:
def draw_roc_curve(y, pred_proba):
    """Print the ROC AUC and plot the ROC curve.

    Args:
        y: True binary labels; 1 / True is the positive class.
        pred_proba: Predicted scores, higher meaning more likely positive.
    """
    false_pos, true_pos, _thresholds = roc_curve(y, pred_proba, pos_label=1)
    score = auc(false_pos, true_pos)
    print("auc:", score)

    _, axis = plt.subplots(1, 1, figsize=(6, 6))
    # Dashed diagonal as the chance-level reference
    axis.plot([0, 1], [0, 1], 'k--')
    axis.plot(false_pos, true_pos, label='ROC CURVE')
    axis.set_title('Area Under Curve(ROC) (score: {:.4f})'.format(score))
    axis.set_xlabel('False positive rate')
    axis.set_ylabel('True positive rate')
    axis.legend(loc='best')
    plt.grid(True)
    plt.show()
    
# Labels must come from teProcessed: `pred` follows its row order (rows are
# re-sorted by rating within each user in loo_preprocess), which differs from `te`
draw_roc_curve(teProcessed.rating >= 4, pred / pred.max())

## Single User Rating Histogram

In [0]:
# user id from 0 ~ 670
uid = 22
# Predict only this user's test samples
tmp = teProcessed.query(f"user_id == {uid}")
single_pred = model_mf_dnn.predict_generator(
    generator=dataFn(tmp, n_batch=batch_size, shuffle=False)(),
    steps=len(tmp) // batch_size + (1 if len(tmp) % batch_size else 0)
).ravel()

# Predicted vs. actual rating distribution, side by side
f, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].set_title("pred distribute")
sns.distplot(single_pred, ax=ax[0])
ax[1].set_title("real distribute")
# userId is an integer column, so compare against the bare number
# (the previously quoted '22' is a string and never matches)
sns.distplot(te.query(f"userId == {uid}").rating, ax=ax[1])
plt.show()

## Single User Detail Table

In [0]:
# user id from 0 ~ 670
uid = 22
# Predict only this user's test samples
tmp = teProcessed.query(f"user_id == {uid}")
single_pred = model_mf_dnn.predict_generator(
    generator=dataFn(tmp, n_batch=batch_size, shuffle=False)(),
    steps=len(tmp) // batch_size + (1 if len(tmp) % batch_size else 0)
).ravel()

# One row per candidate movie: title, actual rating and predicted rating
recommDf = pd.DataFrame(data={
              "userId": uid,
              "movieId": tmp.candidate_movie_id,
              "title": mid_map[tmp.candidate_movie_id].values,
              "rating": tmp.rating.values,
              "predRating": single_pred},
             columns=("userId", "movieId", "title", "rating", "predRating"))
# Toggle ascending between True and False to inspect either end of the list
recommDf.sort_values("rating", ascending=False)

In [0]:
recommDf.sort_values("predRating", ascending=False)

<br/>
<br/>
<br/>

# 利用 Movie Embedding 找出相似電影

In [0]:
movies[movies.title.str.contains("Toy")]

In [0]:
model_mf_dnn.load_weights(model_dir)

# Movie data function generator
movies_cols = ['movieId', 'genres', 'genres_len', 'avg_rating', 'freq_rating', 'year']
def movie_data_fn(data, batch_size=128):
    """Create an endless batch generator factory for the movie-embedding model.

    Relies on the module-level ``movies_cols`` list and on the helpers
    ``get_minibatches_idx`` and ``do_multi``.

    Args:
        data: Encoded movies containing every column in ``movies_cols``.
        batch_size: Number of movies per batch; the last batch of a pass may
            be smaller.

    Returns:
        A zero-argument function; calling it returns a generator that yields,
        forever and in row order, a list with one array per entry of
        ``movies_cols``: a padded 2-D array for 'genres' and column vectors of
        shape (batch, 1) for the others. No labels are yielded.
    """
    def _fn():
        data_inner = data.copy()
        while True:
            indices = get_minibatches_idx(len(data_inner), batch_size, shuffle=False)
            for ind in indices:
                ret = do_multi(data_inner.iloc[ind], ['genres'])
                # Index the underlying ndarray: Series[:, None] is no longer
                # supported by pandas (removed in 2.0)
                yield [np.stack(ret[col].values) if col == 'genres' else ret[col].values[:, None]
                       for col in movies_cols]
    return _fn

def most_like(model, seed_movie, k=10):
    """Return the k movies whose embedding is most cosine-similar to a seed movie.

    Embeds every row of the module-level ``movies_encoded`` with ``model`` and
    ranks all rows against row ``seed_movie``. The result is looked up
    positionally in the module-level ``movies`` table.
    # NOTE(review): row position is used as the movie id here; this presumably
    # relies on movies_encoded being ordered by encoded movieId - confirm.
    """
    catalog = movies_encoded.copy()
    catalog['genres_len'] = catalog.genres.map(len)

    n_steps = len(catalog) // batch_size + (1 if len(catalog) % batch_size else 0)
    movie_emb = model.predict_generator(
        generator=movie_data_fn(catalog, batch_size)(),
        steps=n_steps
    )

    # Similarity of the seed row to every row, best first
    seed_vec = movie_emb[seed_movie].reshape(1, -1)
    similarity = cosine_similarity(seed_vec, movie_emb).ravel()
    top_idx = similarity.argsort()[::-1][:k]
    return movies.iloc[top_idx]

# mse訓練出來的model
most_like(model_emb_movie, 7575, k=11)

<br/>
<br/>
<br/>

# (LAB) 將Model從Regression改為Classification

## Modify Data Generator

In [0]:
# Generator function
def dataFn(data, n_batch=128, shuffle=False):
    """Create an endless mini-batch generator factory for the classification model.

    Same inputs as the regression generator, but the label is binary:
    1 when rating >= 4, else 0. Relies on the module-level ``feats`` and
    ``multi_cols`` lists and on ``get_minibatches_idx`` and ``do_multi``.

    Args:
        data: Processed samples containing every column in ``feats`` (except
            'global', which is added here) plus ``rating``.
        n_batch: Number of samples per batch; the last batch of a pass may be
            smaller.
        shuffle: Whether to reshuffle the sample order on every pass.

    Returns:
        A zero-argument function; calling it returns a generator that yields
        ``(inputs, labels)`` forever, with ``labels`` of shape (batch, 1).
    """
    def fn():
        while True:
            dataInner = data.copy()
            indices = get_minibatches_idx(len(dataInner), n_batch, shuffle=shuffle)
            for ind in indices:
                ret = do_multi(dataInner.iloc[ind], multi_cols)
                # Constant input that drives the model's global-bias embedding
                ret['global'] = 0
                # Index the underlying ndarrays: Series[:, None] is no longer
                # supported by pandas (removed in 2.0)
                inputs = [np.stack(ret[col].values) if col in multi_cols else ret[col].values[:, None]
                          for col in feats]
                yield inputs, (ret.rating.values >= 4).astype(int)[:, None]
    return fn

for data, label in dataFn(trProcessed, n_batch=5, shuffle=False)():
    break

print(f'label\n{label}')

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Lab template: the MF-with-DNN model, to be turned into a classifier.

    As written, the output is dot(user_vector, movie_vector) + b_user +
    b_movie + b_global with no activation; the marked section is where the
    exercise expects an activation that maps the output into 0 ~ 1.

    Args:
        n_users: Accepted but not used inside this function.
        n_movies: Vocabulary size of both movie embeddings (query history and
            candidate movie).
        emb_size: Width of the movie embeddings and of every hidden Dense layer.
        reg: L2 regularization factor for the movie/genre embeddings and the
            Dense kernels (the global-bias embedding is not regularized).

    Returns:
        A tuple ``(model, movie_model)``: ``model`` maps all eleven inputs to
        the prediction; ``movie_model`` maps only the six movie-side inputs to
        the 'movie_impression' vector.

    Note:
        The genre vocabulary size is read from the module-level ``n_genres``.
    """
    # Input tensors
    inp_query = Input([None], dtype='int32', name='inp_query')
    inp_query_len = Input([1], dtype='int32', name='inp_query_len')
    inp_u_freq = Input([1], dtype='float32', name='inp_u_freq')
    inp_u_mean = Input([1], dtype='float32', name='inp_u_mean')
    inp_genres = Input([None], dtype='int32', name='inp_genres')
    inp_genres_len = Input([1], dtype='int32', name='inp_genres_len')
    inp_avg_rating = Input([1], dtype='float32', name='inp_avg_rating')
    inp_freq_rating = Input([1], dtype='float32', name='inp_freq_rating')
    inp_year = Input([1], dtype='float32', name='inp_year')
    inp_movie = Input([1], dtype='int32', name='inp_movie')
    # Hack: only input integer => "0"
    inp_global = Input([1], dtype='int32', name='inp_global')

    # User, movie, genres embedding
    emb_query = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_query)
    emb_genres = Embedding(n_genres, 8, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_genres)
    emb_movie = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_movie)

    # User side
    def sqrtn(x):
        # Pool a padded sequence of embeddings into a single vector: the first
        # `lens` positions of each row are weighted 1/sqrt(lens) and the padding
        # positions get weight 0.
        # Assumes the padded length equals the longest `lens` in the batch.
        qry, lens = x
        lens = tf.reshape(lens, [-1])
        weights = tf.nn.l2_normalize(tf.sequence_mask(lens, dtype=tf.float32), 1)
        weights = tf.expand_dims(weights, -1)
        return tf.reduce_sum(qry * weights, 1)
    emb_query = Lambda(sqrtn, name='emb_query')([emb_query, inp_query_len])
    # Append the two scalar user statistics to the pooled history vector
    emb_query = Concatenate(1)([emb_query, inp_u_freq, inp_u_mean])
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_query)
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='user_impression')(emb_query)

    # Movie side
    emb_genres = Lambda(sqrtn, name='emb_genres')([emb_genres, inp_genres_len])
    emb_movie = Flatten(name='emb_movie')(emb_movie)
    # Candidate embedding + pooled genres + three scalar movie statistics
    emb_movie = Concatenate(1)([emb_movie, emb_genres, inp_avg_rating, inp_freq_rating, inp_year])
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_movie)
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='movie_impression')(emb_movie)

    # Bias terms
    # Projection of emb_query to get bias
    b_user = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_user')(emb_query)
    # Projection of emb_movie to get bias
    b_movie = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_movie')(emb_movie)
    # A 1x1 embedding looked up with the constant input acts as a learned scalar
    b_global = Flatten(name='b_global')(Embedding(1, 1, embeddings_initializer='glorot_uniform')(inp_global))

    # Implements the formulation
    nets = dot([emb_query, emb_movie], axes=1)
    nets = add([nets, b_user, b_movie, b_global])

    ###### START CODE HERE ######
    # Modify the model prediction to 0 ~ 1, hint: add an activation function
    # ...
    ###### END CODE HERE ######

    model = Model([inp_query,
                   inp_query_len,
                   inp_u_freq,
                   inp_u_mean,
                   inp_genres,
                   inp_genres_len,
                   inp_avg_rating,
                   inp_freq_rating,
                   inp_year,
                   inp_movie,
                   inp_global], nets)
    model.summary()
    # Second model shares the movie-side layers and exposes the movie vector
    return model, Model([inp_movie,
                         inp_genres,
                         inp_genres_len,
                         inp_avg_rating,
                         inp_freq_rating,
                         inp_year], emb_movie)

###### START CODE HERE ######
# Modify the hyper parameters to get even better result
emb_size = # 8, 10, 16 ...
reg = # 0.01, 0.005, 0.0005 ...
batch_size = 128
epochs =  # 10, 20 , 30 ...
lr = # 0.1, 0.05, 0.001
###### END CODE HERE ######

model_dir = "./model_mf_dnn"
K.clear_session()
model_mf_dnn, model_emb_movie = get_model(n_users, n_movies, emb_size, reg)

###### START CODE HERE ######
# Find best optimizer, e.g: Adam, SGD, Adagrad, find proper loss function
# model_mf_dnn.compile(...)
###### END CODE HERE ######

tr_len = len(trProcessed)
te_len = len(teProcessed)
# Steps are ceil(len / batch_size) so one epoch covers every sample exactly once
hist = model_mf_dnn.fit_generator(
    generator=dataFn(trProcessed, n_batch=batch_size, shuffle=True)(),
    steps_per_epoch=tr_len // batch_size + (1 if tr_len % batch_size else 0),
    validation_data=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    validation_steps=te_len // batch_size + (1 if te_len % batch_size else 0),
    # batch_size=batch_size,
    epochs=epochs,
    callbacks=[ModelCheckpoint(filepath=model_dir,
                               save_weights_only=True,
                               save_best_only=True)]
)

# After training, load the best weights back
model_mf_dnn.load_weights(model_dir)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.lineplot(x=np.arange(len(hist.history['loss'])), y=hist.history['loss'], label='train')
sns.lineplot(x=np.arange(len(hist.history['val_loss'])), y=hist.history['val_loss'], label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction: one unshuffled pass over the test set, flattened to 1-D
pred = model_mf_dnn.predict_generator(
    generator=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    steps=te_len // batch_size + (1 if te_len % batch_size else 0)
).ravel()


# Binary ground truth in teProcessed row order, the same order `pred` follows
te_labels = (teProcessed.rating >= 4).astype(int)

# AUC
print('Shape of test data: ', pred.shape)
draw_roc_curve(te_labels, pred)

# Confusion matrix, classification report (predictions thresholded at 0.5)
from sklearn.metrics import confusion_matrix, classification_report
print('accuracy_score: ', accuracy_score(te_labels, pred >= 0.5))
print(confusion_matrix(te_labels, pred >= 0.5))
print()
print(classification_report(te_labels, pred >= 0.5))

# The 11 movies nearest to movie index 8787 in the classifier's movie embedding space
most_like(model_emb_movie, 8787, k=11)

## 利用Movie Embedding, 以Cosine Similarity找出前10名相似電影

In [0]:
movies[movies.title.str.contains("Inception")]

In [0]:
# Call most_like function 找出前10名相似電影
# most_like(...)

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

## Solution

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Build the MF-with-DNN model as a binary classifier (lab solution).

    The prediction is sigmoid(dot(user_vector, movie_vector) + b_user +
    b_movie + b_global).

    Args:
        n_users: Accepted but not used inside this function.
        n_movies: Vocabulary size of both movie embeddings (query history and
            candidate movie).
        emb_size: Width of the movie embeddings and of every hidden Dense layer.
        reg: L2 regularization factor for the movie/genre embeddings and the
            Dense kernels (the global-bias embedding is not regularized).

    Returns:
        A tuple ``(model, movie_model)``: ``model`` maps all eleven inputs to
        the sigmoid output; ``movie_model`` maps only the six movie-side
        inputs to the 'movie_impression' vector.

    Note:
        The genre vocabulary size is read from the module-level ``n_genres``.
    """
    # Input tensors
    inp_query = Input([None], dtype='int32', name='inp_query')
    inp_query_len = Input([1], dtype='int32', name='inp_query_len')
    inp_u_freq = Input([1], dtype='float32', name='inp_u_freq')
    inp_u_mean = Input([1], dtype='float32', name='inp_u_mean')
    inp_genres = Input([None], dtype='int32', name='inp_genres')
    inp_genres_len = Input([1], dtype='int32', name='inp_genres_len')
    inp_avg_rating = Input([1], dtype='float32', name='inp_avg_rating')
    inp_freq_rating = Input([1], dtype='float32', name='inp_freq_rating')
    inp_year = Input([1], dtype='float32', name='inp_year')
    inp_movie = Input([1], dtype='int32', name='inp_movie')
    # Hack: only input integer => "0"
    inp_global = Input([1], dtype='int32', name='inp_global')

    # User, movie, genres embedding
    emb_query = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_query)
    emb_genres = Embedding(n_genres, 8, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_genres)
    emb_movie = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_movie)

    # User side
    def sqrtn(x):
        # Pool a padded sequence of embeddings into a single vector: the first
        # `lens` positions of each row are weighted 1/sqrt(lens) and the padding
        # positions get weight 0.
        # Assumes the padded length equals the longest `lens` in the batch.
        qry, lens = x
        lens = tf.reshape(lens, [-1])
        weights = tf.nn.l2_normalize(tf.sequence_mask(lens, dtype=tf.float32), 1)
        weights = tf.expand_dims(weights, -1)
        return tf.reduce_sum(qry * weights, 1)
    emb_query = Lambda(sqrtn, name='emb_query')([emb_query, inp_query_len])
    # Append the two scalar user statistics to the pooled history vector
    emb_query = Concatenate(1)([emb_query, inp_u_freq, inp_u_mean])
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_query)
    emb_query = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='user_impression')(emb_query)

    # Movie side
    emb_genres = Lambda(sqrtn, name='emb_genres')([emb_genres, inp_genres_len])
    emb_movie = Flatten(name='emb_movie')(emb_movie)
    # Candidate embedding + pooled genres + three scalar movie statistics
    emb_movie = Concatenate(1)([emb_movie, emb_genres, inp_avg_rating, inp_freq_rating, inp_year])
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg))(emb_movie)
    emb_movie = Dense(emb_size, activation='relu', kernel_regularizer=regularizers.l2(reg), name='movie_impression')(emb_movie)

    # Bias terms
    # Projection of emb_query to get bias
    b_user = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_user')(emb_query)
    # Projection of emb_movie to get bias
    b_movie = Dense(1,
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(reg),
                   activation='linear',
                   use_bias=False,
                   name='b_movie')(emb_movie)
    # A 1x1 embedding looked up with the constant input acts as a learned scalar
    b_global = Flatten(name='b_global')(Embedding(1, 1, embeddings_initializer='glorot_uniform')(inp_global))

    # Implements the formulation
    nets = dot([emb_query, emb_movie], axes=1)
    nets = add([nets, b_user, b_movie, b_global])

    ###### START CODE HERE ######
    # Modify the model prediction to 0 ~ 1, hint: add an activation function
    nets = Activation("sigmoid")(nets)
    ###### END CODE HERE ######

    model = Model([inp_query,
                   inp_query_len,
                   inp_u_freq,
                   inp_u_mean,
                   inp_genres,
                   inp_genres_len,
                   inp_avg_rating,
                   inp_freq_rating,
                   inp_year,
                   inp_movie,
                   inp_global], nets)
    model.summary()
    # Second model shares the movie-side layers and exposes the movie vector
    return model, Model([inp_movie,
                         inp_genres,
                         inp_genres_len,
                         inp_avg_rating,
                         inp_freq_rating,
                         inp_year], emb_movie)

###### START CODE HERE ######
# Modify the hyper parameters to get even better result
emb_size = 16
reg = 0.0005
batch_size = 128
epochs = 10
lr = 0.05
###### END CODE HERE ######

# Checkpoint path for the best weights seen during training
model_dir = "./model_mf_dnn"
# Reset the Keras backend session before building a fresh model
K.clear_session()
model_mf_dnn, model_emb_movie = get_model(n_users, n_movies, emb_size, reg)

# Binary labels with a sigmoid output: train with binary cross-entropy
model_mf_dnn.compile(optimizer=SGD(lr=lr), loss='binary_crossentropy')

tr_len = len(trProcessed)
te_len = len(teProcessed)
# Steps are ceil(len / batch_size) so one epoch covers every sample exactly once
hist = model_mf_dnn.fit_generator(
    generator=dataFn(trProcessed, n_batch=batch_size, shuffle=True)(),
    steps_per_epoch=tr_len // batch_size + (1 if tr_len % batch_size else 0),
    validation_data=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    validation_steps=te_len // batch_size + (1 if te_len % batch_size else 0),
    # batch_size=batch_size,
    epochs=epochs,
    callbacks=[ModelCheckpoint(filepath=model_dir,
                               save_weights_only=True,
                               save_best_only=True)]
)

# After training, load the best weights back
model_mf_dnn.load_weights(model_dir)

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.lineplot(x=np.arange(len(hist.history['loss'])), y=hist.history['loss'], label='train')
sns.lineplot(x=np.arange(len(hist.history['val_loss'])), y=hist.history['val_loss'], label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction: one unshuffled pass over the test set, flattened to 1-D
pred = model_mf_dnn.predict_generator(
    generator=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    steps=te_len // batch_size + (1 if te_len % batch_size else 0)
).ravel()

# Binary ground truth in teProcessed row order, the same order `pred` follows
te_labels = (teProcessed.rating >= 4).astype(int)
# AUC
print('Shape of test data: ', pred.shape)
draw_roc_curve(te_labels, pred)

# Confusion matrix, classification report (predictions thresholded at 0.5)
print('accuracy_score: ', accuracy_score(te_labels, pred >= 0.5))
print(confusion_matrix(te_labels, pred >= 0.5))
print()
print(classification_report(te_labels, pred >= 0.5))

# The 11 movies nearest to movie index 0 in the classifier's movie embedding space
most_like(model_emb_movie, 0, k=11)

<br/>
<br/>
<br/>

# (LAB) 延續Classification, 以DNN作法取代MF作法

* Concatenate [user, movie], 且用 dense layer 增加hidden layers

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Build the (LAB) user/movie DNN classifier and a movie-side sub-model.

    This is an exercise scaffold: the section between the START/END CODE
    markers must define ``nets`` (the output tensor of the full model).
    Until it does, calling this function raises ``NameError`` at the
    ``Model(..., nets)`` line.

    Args:
        n_users: Accepted for signature compatibility; not used in the body.
        n_movies: Vocabulary size of the two movie-id embeddings
            (``inp_query`` and ``inp_movie``).
        emb_size: Width of the movie-id embeddings and of every Dense layer
            in the user and movie towers.
        reg: L2 regularization factor applied to all embedding tables.

    Returns:
        A tuple ``(model, movie_model)``: ``model`` maps all eleven inputs to
        ``nets``; ``movie_model`` maps the six movie-side inputs to the
        output of the ``movie_impression`` layer.

    Note:
        ``n_genres`` is not a parameter; it is read from the enclosing
        (notebook) scope and must be defined before this function is called.
    """
    # Input tensors
    # inp_query / inp_genres are variable-length id sequences; their *_len
    # inputs carry the sequence lengths consumed by sqrtn below.
    # presumably inp_query holds ids of movies tied to the user - TODO confirm against dataFn
    inp_query = Input([None], dtype='int32', name='inp_query')
    inp_query_len = Input([1], dtype='int32', name='inp_query_len')
    inp_u_freq = Input([1], dtype='float32', name='inp_u_freq')
    inp_u_mean = Input([1], dtype='float32', name='inp_u_mean')
    inp_genres = Input([None], dtype='int32', name='inp_genres')
    inp_genres_len = Input([1], dtype='int32', name='inp_genres_len')
    inp_avg_rating = Input([1], dtype='float32', name='inp_avg_rating')
    inp_freq_rating = Input([1], dtype='float32', name='inp_freq_rating')
    inp_year = Input([1], dtype='float32', name='inp_year')
    inp_movie = Input([1], dtype='int32', name='inp_movie')
    # Hack: an integer input that is expected to be fed the constant 0.
    # It is listed as a model input but is not connected to any layer here.
    inp_global = Input([1], dtype='int32', name='inp_global')
    
    # User, movie, genres embedding
    # emb_query and emb_movie use two separate tables over the same movie ids.
    emb_query = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform', 
                          embeddings_regularizer=regularizers.l2(reg))(inp_query)
    emb_genres = Embedding(n_genres, 8, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_genres)
    emb_movie = Embedding(n_movies, emb_size, embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))(inp_movie)
    
    # User side
    def sqrtn(x):
        """Pool a padded embedding sequence, scaling by 1/sqrt(length).

        ``x`` is a pair ``(embeddings, lengths)``. A 0/1 mask is built from
        the lengths and L2-normalized along axis 1, so each valid position
        gets weight 1/sqrt(length) and padded positions get 0; the weighted
        embeddings are then summed over axis 1.
        """
        qry, lens = x
        lens = tf.reshape(lens, [-1])
        # assumes the padded sequence length equals max(lens) in the batch,
        # since sequence_mask is called without maxlen - TODO confirm
        weights = tf.nn.l2_normalize(tf.sequence_mask(lens, dtype=tf.float32), 1)
        weights = tf.expand_dims(weights, -1)
        return tf.reduce_sum(qry * weights, 1)
    emb_query = Lambda(sqrtn, name='emb_query')([emb_query, inp_query_len])
    emb_query = Concatenate(1)([emb_query, inp_u_freq, inp_u_mean])
    emb_query = Dense(emb_size, activation='relu')(emb_query)
    emb_query = Dense(emb_size, activation='relu', name='user_impression')(emb_query)
    
    # Movie side
    emb_genres = Lambda(sqrtn, name='emb_genres')([emb_genres, inp_genres_len])
    emb_movie = Flatten(name='emb_movie')(emb_movie)
    emb_movie = Concatenate(1)([emb_movie, emb_genres, inp_avg_rating, inp_freq_rating, inp_year])
    emb_movie = Dense(emb_size, activation='relu')(emb_movie)
    emb_movie = Dense(emb_size, activation='relu', name='movie_impression')(emb_movie)
    
    ###### START CODE HERE ######
    # Concatenate the user and movie embedding, add 2 hidden layers
    # Maybe you need Dropout! BatchNormalization, etc..., try it for the better result!
    # The code written here must assign the final output tensor to `nets`.
    # nets = Concatenate(1)([emb_query, emb_movie])
    # nets = Dense(...)(nets)
    # ...
    ###### END CODE HERE ######
    
    # Full model: every input (including the unconnected inp_global) -> nets
    model = Model([inp_query, 
                   inp_query_len, 
                   inp_u_freq,
                   inp_u_mean,
                   inp_genres, 
                   inp_genres_len, 
                   inp_avg_rating,
                   inp_freq_rating,
                   inp_year,
                   inp_movie, 
                   inp_global], nets)
    model.summary()
    # Second model exposes the movie tower output (movie_impression)
    return model, Model([inp_movie, 
                         inp_genres, 
                         inp_genres_len, 
                         inp_avg_rating,
                         inp_freq_rating,
                         inp_year], emb_movie)

###### START CODE HERE ######
# Modify the hyper parameters to get even better result
emb_size = # 8, 10, 16 ...
reg = # 0.01, 0.005, 0.0005 ...
batch_size = 128
epochs =  # 10, 20 , 30 ...
lr = # 0.1, 0.05, 0.001
###### END CODE HERE ######

model_dir = "./model_mf_dnn"
K.clear_session()
model_mf_dnn, model_emb_movie = get_model(n_users, n_movies, emb_size, reg)

###### START CODE HERE ######
# Find best optimizer, e.g: Adam, SGD, Adagrad, find proper loss function
# model_mf.compile(...)
###### END CODE HERE ######

tr_len = len(trProcessed)
te_len = len(teProcessed)
hist = model_mf_dnn.fit_generator(
    generator=dataFn(trProcessed, n_batch=batch_size, shuffle=True)(),
    steps_per_epoch=tr_len // batch_size + (1 if tr_len % batch_size else 0),
    validation_data=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    validation_steps=te_len // batch_size + (1 if te_len % batch_size else 0),
    # batch_size=batch_size,
    epochs=epochs,
    callbacks=[ModelCheckpoint(filepath=model_dir, 
                               save_weights_only=True, 
                               save_best_only=True)]
)

# After training, load the best weights back
model_mf_dnn.load_weights(model_dir)

sns.lineplot(np.arange(len(hist.history['loss'])), hist.history['loss'], label='train')
sns.lineplot(np.arange(len(hist.history['val_loss'])), hist.history['val_loss'], label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction
te_len = len(teProcessed)
pred = model_mf_dnn.predict_generator(
    generator=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    steps=te_len // batch_size + (1 if te_len % batch_size else 0)
).ravel()


te_labels = (teProcessed.rating >= 4).astype(int)
# AUC
print('Shape of test data: ', pred.shape)
draw_roc_curve(te_labels, pred)

# Confusion matrix, classification report
print('accuracy_score: ', accuracy_score(te_labels, pred >= 0.5))
print(confusion_matrix(te_labels, pred >= 0.5))
print()
print(classification_report(te_labels, pred >= 0.5))

most_like(model_emb_movie, 8787, k=11)

<br/>
<br/>
<br/>
<br/>
<br/>
<br/>

## Solution

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Build the user/movie DNN classifier and a movie-side sub-model.

    A user tower and a movie tower each produce an ``emb_size``-wide vector;
    the two vectors are concatenated and passed through two
    Dense -> BatchNormalization -> ReLU blocks (32 and 16 units) and a final
    single-unit sigmoid layer.

    Args:
        n_users: Accepted for signature compatibility; not used in the body.
        n_movies: Vocabulary size of the two movie-id embedding tables.
        emb_size: Width of the movie-id embeddings and of the tower layers.
        reg: L2 regularization factor applied to every embedding table.

    Returns:
        ``(model, movie_model)``: the full classifier over all eleven inputs,
        and a model mapping the six movie-side inputs to the output of the
        ``movie_impression`` layer.

    Note:
        ``n_genres`` is read from the enclosing (notebook) scope.
    """
    def scalar_input(name, dtype):
        # One value per example.
        return Input([1], dtype=dtype, name=name)

    def id_sequence_input(name):
        # Variable-length sequence of integer ids.
        return Input([None], dtype='int32', name=name)

    def embed(vocab_size, width, ids):
        # Glorot-initialized, L2-regularized embedding lookup.
        table = Embedding(vocab_size, width,
                          embeddings_initializer='glorot_uniform',
                          embeddings_regularizer=regularizers.l2(reg))
        return table(ids)

    def sqrtn(x):
        # Sum the valid positions of a padded sequence, each weighted by
        # 1/sqrt(length) (L2-normalized 0/1 mask along axis 1).
        seq_emb, seq_len = x
        flat_len = tf.reshape(seq_len, [-1])
        mask = tf.sequence_mask(flat_len, dtype=tf.float32)
        unit_weights = tf.expand_dims(tf.nn.l2_normalize(mask, 1), -1)
        return tf.reduce_sum(seq_emb * unit_weights, 1)

    # ---- Inputs ----
    inp_query = id_sequence_input('inp_query')
    inp_query_len = scalar_input('inp_query_len', 'int32')
    inp_u_freq = scalar_input('inp_u_freq', 'float32')
    inp_u_mean = scalar_input('inp_u_mean', 'float32')
    inp_genres = id_sequence_input('inp_genres')
    inp_genres_len = scalar_input('inp_genres_len', 'int32')
    inp_avg_rating = scalar_input('inp_avg_rating', 'float32')
    inp_freq_rating = scalar_input('inp_freq_rating', 'float32')
    inp_year = scalar_input('inp_year', 'float32')
    inp_movie = scalar_input('inp_movie', 'int32')
    # Hack input (expected to be fed 0); part of the model signature only,
    # it is not wired to any layer.
    inp_global = scalar_input('inp_global', 'int32')

    # ---- Embedding lookups (query, genres, movie - in that order) ----
    query_seq = embed(n_movies, emb_size, inp_query)
    genres_seq = embed(n_genres, 8, inp_genres)
    movie_seq = embed(n_movies, emb_size, inp_movie)

    # ---- User tower ----
    user_vec = Lambda(sqrtn, name='emb_query')([query_seq, inp_query_len])
    user_vec = Concatenate(1)([user_vec, inp_u_freq, inp_u_mean])
    user_vec = Dense(emb_size, activation='relu')(user_vec)
    user_vec = Dense(emb_size, activation='relu', name='user_impression')(user_vec)

    # ---- Movie tower ----
    genres_vec = Lambda(sqrtn, name='emb_genres')([genres_seq, inp_genres_len])
    movie_vec = Flatten(name='emb_movie')(movie_seq)
    movie_vec = Concatenate(1)(
        [movie_vec, genres_vec, inp_avg_rating, inp_freq_rating, inp_year])
    movie_vec = Dense(emb_size, activation='relu')(movie_vec)
    movie_vec = Dense(emb_size, activation='relu', name='movie_impression')(movie_vec)

    ###### START CODE HERE ######
    # Concatenate both towers, then two Dense -> BatchNorm -> ReLU blocks
    # followed by a sigmoid output unit.
    nets = Concatenate(1)([user_vec, movie_vec])
    for units in (32, 16):
        nets = Dense(units, activation='linear')(nets)
        nets = Activation('relu')(BatchNormalization()(nets))
    nets = Dense(1, activation='sigmoid')(nets)
    ###### END CODE HERE ######

    all_inputs = [
        inp_query,
        inp_query_len,
        inp_u_freq,
        inp_u_mean,
        inp_genres,
        inp_genres_len,
        inp_avg_rating,
        inp_freq_rating,
        inp_year,
        inp_movie,
        inp_global,
    ]
    movie_inputs = [
        inp_movie,
        inp_genres,
        inp_genres_len,
        inp_avg_rating,
        inp_freq_rating,
        inp_year,
    ]

    model = Model(all_inputs, nets)
    model.summary()
    movie_model = Model(movie_inputs, movie_vec)
    return model, movie_model

# Train the solution model, reload the best checkpoint, then evaluate on the
# test split (loss curves, ROC, accuracy, confusion matrix, classification
# report) and list the nearest neighbours of one movie.

###### START CODE HERE ######
# Modify the hyper parameters to get even better result
emb_size = 16
reg = 0.0005
batch_size = 128
epochs = 10
lr = 0.05
###### END CODE HERE ######

model_dir = "./model_mf_dnn"
# Reset Keras backend state before building a fresh model
K.clear_session()
model_mf_dnn, model_emb_movie = get_model(n_users, n_movies, emb_size, reg)

model_mf_dnn.compile(optimizer=SGD(lr=lr), loss='binary_crossentropy')

# Number of generator steps needed to cover each split once
# (ceiling division, so a trailing partial batch is still counted).
tr_len = len(trProcessed)
te_len = len(teProcessed)
tr_steps = (tr_len + batch_size - 1) // batch_size
te_steps = (te_len + batch_size - 1) // batch_size

# The batch size is passed to dataFn (n_batch), not to fit_generator.
hist = model_mf_dnn.fit_generator(
    generator=dataFn(trProcessed, n_batch=batch_size, shuffle=True)(),
    steps_per_epoch=tr_steps,
    validation_data=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    validation_steps=te_steps,
    epochs=epochs,
    # save_best_only keeps just the best epoch's weights
    # (ModelCheckpoint monitors val_loss unless told otherwise).
    callbacks=[ModelCheckpoint(filepath=model_dir,
                               save_weights_only=True,
                               save_best_only=True)]
)

# After training, load the best weights back
model_mf_dnn.load_weights(model_dir)

# x/y are passed by keyword: recent seaborn releases reject positional x/y.
train_loss = hist.history['loss']
valid_loss = hist.history['val_loss']
sns.lineplot(x=np.arange(len(train_loss)), y=train_loss, label='train')
sns.lineplot(x=np.arange(len(valid_loss)), y=valid_loss, label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction
pred = model_mf_dnn.predict_generator(
    generator=dataFn(teProcessed, n_batch=batch_size, shuffle=False)(),
    steps=te_steps
).ravel()


# Ratings of 4 and above are the positive class
te_labels = (teProcessed.rating >= 4).astype(int)
# AUC
print('Shape of test data: ', pred.shape)
draw_roc_curve(te_labels, pred)

# Confusion matrix, classification report (decision threshold 0.5)
pred_labels = pred >= 0.5
print('accuracy_score: ', accuracy_score(te_labels, pred_labels))
print(confusion_matrix(te_labels, pred_labels))
print()
print(classification_report(te_labels, pred_labels))

most_like(model_emb_movie, 0, k=11)