# Neural Collaborative Filtering

![](https://github.com/CloudMile/ml-specialized/blob/master/kkbox/kkbox_model_structure.jpg?raw=true)

### GMF: Generalized Matrix Factorization
*  線性 model 學習 user x movie 的 **latent feature interactions**


### MLP: Dense layer with ReLU activation function
*  非線性 model 學習 user x movie 的 **interaction function**



# Load Libs

In [0]:
%reload_ext autoreload
%autoreload 2

!pip install seaborn --upgrade

import os, sys, numpy as np, pandas as pd, tensorflow as tf
import seaborn as sns, keras
sns.set(style='white')

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

from sklearn.metrics import roc_curve, auc
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, BatchNormalization, Activation, Input, Dropout, Embedding, Flatten, Input
from keras.layers import Concatenate
from keras.optimizers import Adagrad, SGD, Adam
from keras import regularizers
from keras.layers import dot, add, Layer
from keras.callbacks import ModelCheckpoint

# Compact NumPy printing: 4 decimal places, no scientific notation, 100-character lines.
np.set_printoptions(precision=4, suppress=True, linewidth=100)

# Read Data and Preview

In [0]:
# Download the ratings and movies CSV files straight from cloud storage into DataFrames.
# NOTE(review): later cells read the `userId` and `rating` columns of `ratings`;
# the remaining schema is not visible here -- confirm with `ratings.head()`.
ratings = pd.read_csv('https://storage.googleapis.com/allianz-course/data/ratings.csv')
movies = pd.read_csv('https://storage.googleapis.com/allianz-course/data/movies.csv')

# ...

# Encode Data

* 使用sklearn的LabelEncoder transform userId, movieId
* 紀錄user數量與movie數量 ==> `n_users, n_movies`

In [0]:
# One LabelEncoder per id column: uid_enc for user ids, mid_enc for movie ids.
uid_enc, mid_enc = LabelEncoder(), LabelEncoder()
# ... (exercise placeholder: fit the encoders)

# Encode user ids and movie ids into integer indices with the encoders above
# ...


# Record the number of users and the number of movies (n_users, n_movies)
# ...

# Split Train, Test Data
* 以4分為閾值, 4分以上為positive, 未滿4分為negative
* 每個user分positive, negative兩部分, 各取30%到test data

In [0]:
def split_ratings(data, pos_thres=4, test_ratio=0.3):
    """Split ratings into train and test sets, per user and per label.

    Rows are grouped by ``userId``. Users with fewer than 5 rows are skipped.
    For every remaining user the rows are divided into positives
    (``rating >= pos_thres``) and negatives (``rating < pos_thres``). Within
    each part the leading ``1 - test_ratio`` share of rows, in their existing
    order, goes to train and the rest goes to test.

    Args:
        data: DataFrame with at least the columns ``userId`` and ``rating``.
        pos_thres: Ratings greater than or equal to this value are positive.
        test_ratio: Fraction of each user's positive and negative rows that
            is sent to the test set.

    Returns:
        A ``(train, test)`` tuple of DataFrames with a fresh 0..n-1 index.
        Both are empty, with the columns of ``data``, when no user has at
        least 5 rows.
    """
    tr, te = [], []
    for _, df in data.groupby("userId"):
        if len(df) < 5:
            continue

        pos = df[df["rating"] >= pos_thres]
        neg = df[df["rating"] < pos_thres]
        # Number of leading rows of each part that stay in the train set.
        pos_len = int(len(pos) * (1 - test_ratio))
        neg_len = int(len(neg) * (1 - test_ratio))

        # DataFrame.append was removed in pandas 2.0, so the pieces are
        # collected and concatenated once at the end. Empty pieces add no
        # rows and are dropped up front.
        tr.extend(part for part in (pos[:pos_len], neg[:neg_len]) if len(part))
        te.extend(part for part in (pos[pos_len:], neg[neg_len:]) if len(part))

    def _combine(parts):
        # pd.concat raises ValueError on an empty list; return an empty
        # frame with the input's columns instead.
        if not parts:
            return data.iloc[:0].reset_index(drop=True)
        return pd.concat(parts, ignore_index=True)

    return _combine(tr), _combine(te)

# ...

# 修改Label 為 0, 1 (rating >= 4: 1,  rating < 4: 0)

* 並且修正為可帶入Keras的資料格式 
> `[user[:, None], movie[:, None], label[:, None]]`

# Model of Neural Collaborative Filtering



## Build Model Function

* 需要**4個Embedding**
* 不需要**3個Bias**

In [0]:
def get_model(n_users, n_movies, emb_size, reg):
    """Build the Neural Collaborative Filtering Keras model (exercise template).

    Creates one int32 input of shape [1] for the user and one for the movie,
    builds a Model from those two inputs to the tensor ``nets``, prints the
    model summary and returns the model.

    NOTE(review): ``nets`` is not assigned anywhere in the visible body. The
    elided "# ..." sections must define it, otherwise the Model(...) call
    raises NameError.

    Args:
        n_users: Used as the embedding input size in the commented-out
            template; presumably the number of distinct users -- confirm.
        n_movies: Presumably the number of distinct movies -- confirm when
            filling in the movie embedding.
        emb_size: Embedding output dimension in the commented-out template.
        reg: L2 regularization factor in the commented-out template.

    Returns:
        The Keras Model whose inputs are ``[inp_user, inp_movie]``.
    """
    # Input tensors
    inp_user = Input([1], dtype='int32')
    inp_movie = Input([1], dtype='int32')
    
    # User, movie embedding
    # emb_user = Embedding(n_users, emb_size, embeddings_initializer='glorot_uniform',
    #                      embeddings_regularizer=regularizers.l2(reg), name='emb_user')(inp_user)
    # emb_user = Flatten()(emb_user)
    # ...
    
    # Implements the formulation
    # ...
    
    # Inputs: [user, movie]; output: the `nets` tensor built above
    model = Model([inp_user, inp_movie], nets)
    model.summary()
    return model


# emb_size = ...
# reg = ...
# batch_size = 128
# epochs = 20

# K.clear_session()
# model = get_model(n_users, n_movies, emb_size, reg)
# model.compile(...)

# Training

In [0]:
# Checkpoint path (placeholder to fill in); passed to ModelCheckpoint below and
# read back by load_weights after training.
model_dir = ...
# Fit the model; every `...` argument is an exercise placeholder.
# The ModelCheckpoint callback is configured to write weights only
# (save_weights_only=True) and to keep only the best checkpoint
# (save_best_only=True).
# NOTE(review): "best" is judged on the callback's default monitored metric -- confirm.
hist = model.fit(x=...,
                    y=...,
                    validation_data=...,
                    batch_size=...,
                    epochs=...,
                    callbacks=[ModelCheckpoint(filepath=model_dir, 
                                               save_weights_only=True, 
                                               save_best_only=True)])

# After training, load the best weights back
model.load_weights(model_dir)

# Plot the per-epoch training and validation loss recorded by model.fit.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises TypeError.
sns.lineplot(x=np.arange(len(hist.history['loss'])), y=hist.history['loss'], label='train')
sns.lineplot(x=np.arange(len(hist.history['val_loss'])), y=hist.history['val_loss'], label='test')
plt.title('loss')
plt.grid(True)
plt.show()

# Prediction

In [0]:
# Predict on the test inputs and flatten the model output to a 1-D array.
pred = model.predict(te_x).ravel()
# Note: the value printed is the shape of the flattened prediction array.
print('Shape of test data: ', pred.shape)

# AUC

In [0]:
def draw_roc_curve(y, pred_proba):
    """Print the ROC AUC and plot the ROC curve.

    Args:
        y: True labels; label 1 is treated as the positive class.
        pred_proba: Predicted scores passed to ``roc_curve`` together with ``y``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y, pred_proba, pos_label=1)
    auc_scr = auc(false_pos_rate, true_pos_rate)
    print("auc:", auc_scr)

    _fig, axis = plt.subplots(1, 1, figsize=(6, 6))

    # Dashed black diagonal from (0, 0) to (1, 1), drawn first as a reference line.
    axis.plot([0, 1], [0, 1], 'k--')
    axis.plot(false_pos_rate, true_pos_rate, label='ROC CURVE')

    title = 'Area Under Curve(ROC) (score: {:.4f})'.format(auc_scr)
    axis.set_xlabel('False positive rate')
    axis.set_ylabel('True positive rate')
    axis.set_title(title)
    axis.legend(loc='best')

    plt.grid(True)
    plt.show()
    
# draw_roc_curve(...)

# Accuracy, Confusion Matrix, and Classification Report

* 閾值設定在0.5, **當然也可以用F-beta score找最佳閾值**

In [0]:
# classification_report is imported as well because the template below calls it.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# print('accuracy_score: ', ...)
# print(confusion_matrix(...))
# print()
# print(classification_report(...))