In [None]:
import json
import os
import pandas as pd
import numpy as np
#import ExplicitMF as mf

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import linear_kernel

import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.utils import shuffle
import tensorflow as tf
from tqdm import tqdm
import re
import scipy
#from tensorflow import keras
from tensorflow.keras.layers import Input,Flatten, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization, multiply
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import collections
from collections import Counter
import math

In [None]:
# the methods traverse_dir(), load_data() and the dataset are provided by supervisor Peng Liu

def traverse_dir(rootDir, level=2):
    """Collect paths found below ``rootDir``.

    With ``level == 1`` every direct entry (file or directory) is returned
    as-is. For any other value, sub-directories are descended into
    recursively (``level`` is handed on unchanged) and only non-directory
    entries are collected. Each visited directory is echoed to stdout.

    Returns:
        list of paths, each prefixed with the directory it was found in.
    """
    print(">>>", rootDir)
    collected = []
    for entry in os.listdir(rootDir):
        entry_path = os.path.join(rootDir, entry)
        descend = level != 1 and os.path.isdir(entry_path)
        if descend:
            collected += traverse_dir(entry_path, level)
        else:
            collected.append(entry_path)
    return collected

def load_data(rootpath, flist):
    """Load events from JSON-lines files and convert them to a DataFrame.

    Args:
        rootpath: Unused; kept so existing callers keep working. The entries
            of ``flist`` are opened exactly as given.
        flist: Iterable of file paths, each holding one JSON document per line.

    Returns:
        pd.DataFrame built from every decoded object that is not JSON ``null``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    map_lst = []
    for fname in flist:
        # The context manager closes each handle as soon as the file is read
        # instead of leaving it to the garbage collector.
        with open(fname) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    # Blank lines carry no event and would make json.loads fail.
                    continue
                obj = json.loads(line)
                if obj is not None:
                    map_lst.append(obj)
    return pd.DataFrame(map_lst)


In [None]:
# Folder holding the event files; traverse_dir lists the files below it and
# load_data parses them into one DataFrame.
fpath="./active1000/"
flist = traverse_dir(fpath)
df = load_data(fpath, flist)

# 1. Preprocessing

In [None]:
# Row count before and after dropping events that have no documentId.
print(len(df))
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df["documentId"].notna()].copy()
print(len(df))

# Map the raw userId / documentId values to contiguous integer ids; the
# encoders are kept so the ids can be mapped back with inverse_transform.
user_enc = LabelEncoder()
article_enc = LabelEncoder()
df["user_id"] = user_enc.fit_transform(df["userId"].values)
df["article_id"] = article_enc.fit_transform(df["documentId"].values)

In [None]:
# Show the first rows of the frame, including the user_id / article_id columns.
df.head()

In [None]:
def take_one_category(text):
    """Reduce a ``|``-separated category path to a single category label.

    Args:
        text: Category string such as ``"a|b"``. Values that cannot be split
            this way (e.g. NaN or None for events without a category) are
            accepted.

    Returns:
        str: The second path component when there is one, otherwise the only
        component; ``"null"`` when ``text`` cannot be split.
    """
    try:
        parts = text.split("|")
    except (AttributeError, TypeError):
        # Missing categories are not strings and cannot be split.
        return "null"
    # Always hand back a string: returning the list itself would put
    # unhashable values into the column that is label-encoded afterwards.
    return parts[1] if len(parts) > 1 else parts[0]
# Store the reduced label in a new column next to the raw `category` value.
df["category_cleaned"] = df["category"].apply(func = take_one_category)

In [None]:
# Encode the cleaned category labels as integers (category_int).
category_enc = LabelEncoder()
df["category_int"] = category_enc.fit_transform(df["category_cleaned"].values)


In [None]:
def get_userid_to_profile(df, profile_size=6):
    """Build a fixed-length category profile for every user.

    A profile lists the user's distinct ``category_int`` values ordered by
    how often they occur (most frequent first, ties broken by the larger
    category value), truncated or right-padded with 0 to ``profile_size``.

    Args:
        df: DataFrame with ``user_id`` and ``category_int`` columns.
        profile_size: Length of every profile (default 6).

    Returns:
        collections.defaultdict(list) mapping user_id to its profile list.
    """
    userid_to_profile = collections.defaultdict(list)
    # A single groupby pass replaces one full-frame scan per user;
    # sort=False keeps users in order of first appearance.
    grouped = df.groupby("user_id", sort=False)["category_int"]
    for user_id, user_categories in tqdm(grouped):
        counter = Counter(user_categories.tolist())
        # Sorting the distinct categories is enough: the key depends only on
        # the category itself.
        ranked = sorted(counter, key=lambda cat: (counter[cat], cat), reverse=True)
        # NOTE(review): 0 is used as padding but is also a valid encoded category.
        padding = [0] * max(0, profile_size - len(ranked))
        userid_to_profile[user_id] = (ranked + padding)[:profile_size]
    return userid_to_profile
# Global lookup: user_id -> category profile, computed over all events in df.
userid_to_profile = get_userid_to_profile(df)

In [None]:
# Show the first rows again, now with the category columns added above.
df.head()

# 2. Train-test-split

In [None]:
SEED = 42  # seed handed to random.seed() when evaluation candidates are sampled
SAMPLE_SIZE = 99  # non-interacted articles sampled per user by get_not_interacted
NUM_NEGATIVES = 4  # negatives drawn per positive pair in negative_sampling
ALL_ARTICLE_IDS = df["article_id"].unique()
NUM_ARTICLES = len(ALL_ARTICLE_IDS)
ALL_USERS = df["user_id"].unique()
NUM_USERS = len(ALL_USERS)
NUM_CATEGORIES = len(df["category_int"].unique())

### Global dicts ###
    #userid_to_profile
    #userid_to_click_history

In [None]:
# Rank each user's events from newest (rank 1) to oldest; method="first"
# breaks ties by row order, so exactly one event per user gets rank 1.
df["rank_latest"] = df.groupby(["user_id"])["time"].rank(method="first", ascending=False)

# Leave-one-out split: the newest event per user is held out for testing.
# .copy() detaches the training slice so adding a column to it is safe
# (no SettingWithCopyWarning).
train_true = df[df['rank_latest'] != 1].copy()
test_true = df[df['rank_latest'] == 1]

# Every observed interaction is a positive example.
train_true["label"] = 1

train = train_true[["user_id", "article_id", "label"]]
test = test_true[["user_id", "article_id"]]

In [None]:
def get_userid_to_click_history(df, history_size=10):
    """Build a fixed-length click history for every user.

    The history holds the first ``history_size`` article ids of the user in
    the row order of ``df``, right-padded with 0 when the user has fewer.

    Args:
        df: DataFrame with ``user_id`` and ``article_id`` columns.
        history_size: Length of every history array (default 10).

    Returns:
        dict mapping user_id to a 1-D numpy array of length ``history_size``.
    """
    userid_to_article_history = {}
    # A single groupby pass replaces one full-frame scan per user;
    # sort=False keeps users in order of first appearance.
    grouped = df.groupby("user_id", sort=False)["article_id"]
    for user_id, user_articles in tqdm(grouped):
        click_history = user_articles.values[:history_size]
        missing = history_size - len(click_history)
        if missing > 0:
            # NOTE(review): 0 is used as padding but is also a valid article id.
            click_history = np.pad(click_history, (0, missing), constant_values=0)
        userid_to_article_history[user_id] = click_history
    return userid_to_article_history
# Global lookup: user_id -> click history, built from training events only.
userid_to_click_history = get_userid_to_click_history(train_true)

In [None]:
def get_category(article_id, df=df):
    """Return the ``category_int`` of the first ``df`` row matching ``article_id``.

    Raises IndexError when no row has that article id.
    """
    matches = df.loc[df["article_id"] == article_id, "category_int"]
    return matches.to_numpy()[0]

In [None]:
# Show the first rows, now including the rank_latest column used for the split.
df.head()

In [None]:
def get_items_interacted(user_id, df):
    """Return the set of article ids on the rows of ``df`` that belong to ``user_id``."""
    user_rows = df[df["user_id"] == user_id]
    articles = user_rows["article_id"]
    if type(articles) is not pd.Series:
        # Anything other than a plain Series is treated as a single item.
        articles = [articles]
    return set(articles)

def get_not_interacted(user_id, interactions_df=df):
    """Sample SAMPLE_SIZE articles the user has not interacted with.

    Args:
        user_id: Encoded user id.
        interactions_df: Frame used to look up the user's interactions.
            The candidate pool is always taken from the global ``df``.

    Returns:
        list of SAMPLE_SIZE article ids. The global ``random`` generator is
        re-seeded with SEED on every call, so the draw is repeatable.

    Raises:
        ValueError: If fewer than SAMPLE_SIZE candidates are available.
    """
    interacted_items = get_items_interacted(user_id, interactions_df)
    all_items = set(df["article_id"])
    not_interacted_items = all_items - interacted_items
    random.seed(SEED)
    # random.sample() needs a sequence: passing a set is deprecated since
    # Python 3.9 and raises TypeError from 3.11. Sorting also makes the draw
    # independent of set iteration order.
    return random.sample(sorted(not_interacted_items), SAMPLE_SIZE)

In [None]:
# Global lookup: article_id -> category_int, taken from all events in df.
article_to_category = df[["article_id", "category_int"]].set_index("article_id").to_dict()["category_int"]

In [None]:
def negative_sampling(train_df, user_id, article_id, seed=None):
    """Build training instances: each positive pair plus NUM_NEGATIVES negatives.

    For every distinct (user, article) pair in ``train_df`` the pair itself is
    emitted with label 1, together with NUM_NEGATIVES randomly drawn articles
    that are not paired with that user in ``train_df`` (label 0). Category,
    click history and profile are looked up from the global dictionaries.

    Args:
        train_df: DataFrame of observed interactions.
        user_id: Name of the user-id column in ``train_df``.
        article_id: Name of the article-id column in ``train_df``.
        seed: Optional seed for drawing the negatives. ``None`` (default)
            gives a different draw on every call.

    Returns:
        Tuple ``(users, articles, categories, click_history, profiles, labels)``
        shuffled in unison. ``click_history`` is reshaped to (-1, 10) and
        ``profiles`` to (-1, 6); the other four are lists.
    """
    rng = np.random.default_rng(seed)
    users, articles, categories, click_history, profiles, labels = [], [], [], [], [], []
    user_item_set = set(zip(train_df[user_id].values, train_df[article_id].values))

    def add_instance(user, item, label):
        # Append one training row to all parallel lists at once.
        users.append(user)
        articles.append(item)
        categories.append(article_to_category[item])
        click_history.append(userid_to_click_history[user])
        profiles.append(userid_to_profile[user])
        labels.append(label)

    for (u, i) in user_item_set:
        for _ in range(NUM_NEGATIVES):
            # Rejection sampling: redraw until the article is unseen by u.
            # NOTE(review): never terminates for a user paired with every article.
            negative_item = rng.choice(ALL_ARTICLE_IDS)
            while (u, negative_item) in user_item_set:
                negative_item = rng.choice(ALL_ARTICLE_IDS)
            add_instance(u, negative_item, 0)
        add_instance(u, i, 1)

    users, articles, categories, click_history, profiles, labels = shuffle(
        users, articles, categories, click_history, profiles, labels, random_state=0
    )
    click_history = np.concatenate(click_history).reshape(-1, 10)
    profiles = np.concatenate(profiles).reshape(-1, 6)
    return users, articles, categories, click_history, profiles, labels

# Seeded so that a fresh "Restart & Run All" reproduces the same training set.
train_users, train_articles,train_categories,train_click_history,train_profiles, train_labels = negative_sampling(train, "user_id", "article_id", seed=SEED)

In [None]:
# Collect the sampled training instances (positives and negatives) in one frame.
df_train = pd.DataFrame(list(zip(train_users, train_articles,train_categories,train_click_history,train_profiles, train_labels)),
                       columns=["user_id", "article_id", "category", "click_history", "user_profile", "label"])

# 4. Models

In [None]:
def getHitRatio(ranklist, gtItem):
    """Return 1 when ``gtItem`` appears in ``ranklist``, otherwise 0."""
    hit = any(candidate == gtItem for candidate in ranklist)
    return 1 if hit else 0

def getNDCG(ranklist, gtItem):
    """Return log(2) / log(position + 2) for the first match of ``gtItem``.

    ``position`` is the 0-based index in ``ranklist``; 0 is returned when the
    item is not in the list.
    """
    for position, candidate in enumerate(ranklist):
        if candidate == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0

# 4.1 Neumf without features

In [None]:
num_users = NUM_USERS
num_items = NUM_ARTICLES
dims = 20
def get_model_neumf(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a NeuMF model over user and item ids only.

    Two branches share the id inputs: an element-wise product of
    ``dims``-wide user/item embeddings, and an MLP over concatenated
    user/item embeddings of width ``dense_layers[0] / 2`` each. Both branch
    outputs are concatenated and fed to one sigmoid unit.

    Args:
        num_users: Size of the user-id vocabulary.
        num_items: Size of the item-id vocabulary.
        dims: Embedding width of the matrix-factorization branch.
        dense_layers: Unit counts of the MLP layers, in order.

    Returns:
        Compiled Model (Adam(0.01), binary cross-entropy, accuracy) taking
        ``[user, item]`` as inputs.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")

    def embed(ids, vocab_size, width, name):
        # `input_length` is left out: it is deprecated in Keras 3 and the
        # sequence length is already fixed by the Input shape.
        return Embedding(input_dim=vocab_size,
                         output_dim=width,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name=name)(ids)

    mlp_width = int(dense_layers[0] / 2)
    mf_user_emb = embed(user_input, num_users, dims, "mf_user_emb")
    mf_item_emb = embed(item_input, num_items, dims, "mf_item_emb")
    mlp_user_emb = embed(user_input, num_users, mlp_width, "mlp_user_emb")
    mlp_item_emb = embed(item_input, num_items, mlp_width, "mlp_user_item")

    # Matrix factorization branch: element-wise product of the two vectors.
    mf_vec = multiply([Reshape([dims])(mf_user_emb), Reshape([dims])(mf_item_emb)])

    # MLP branch.
    mlp_vector = Flatten()(Concatenate()([mlp_user_emb, mlp_item_emb]))
    for num_nodes in dense_layers:
        mlp_vector = Dense(num_nodes, activation="relu")(mlp_vector)

    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf = get_model_neumf(num_users, num_items, dims)

In [None]:
# Turn the sampled training lists into column vectors of shape (n, 1).
users_input, articles_input, labels_input = np.array(train_users).reshape(-1,1), np.array(train_articles).reshape(-1,1), np.array(train_labels).reshape(-1,1)

In [None]:
# NOTE(review): this takes the row index of `train`, not its user_id column,
# and the name is not read again in this cell.
all_user_ids = train.index.unique().values

# Per-epoch training history. Each entry appended below is the one-element
# list that Keras returns for a single-epoch fit.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Placeholders for per-epoch ranking metrics; nothing in this cell fills them.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=4
for epoch in range(epochs):
    # One Keras epoch per iteration so the history can be collected per epoch.
    hist = model_neumf.fit([users_input, articles_input], labels_input, epochs=1, shuffle=True, verbose=1, batch_size=1024)
    
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    # No validation data is passed to fit(), so val_loss / val_acc stay empty.

In [None]:
# Evaluate on 100 held-out (user, latest article) pairs. The shuffle is seeded
# so every model in this notebook is scored on the same pairs.
test_users = test.user_id.values
test_items = test.article_id.values
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_users, test_items = test_users[:100], test_items[:100]
test_set = zip(test_users, test_items)
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
for (u,i) in tqdm(test_set, total=len(test_users)):
    # Rank the true item against the sampled non-interacted articles.
    not_interacted_items = get_not_interacted(u)
    items = np.array([i]+not_interacted_items)
    users = np.full(len(items), u)
    predictions = model_neumf.predict([users, items], verbose=0)
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    hits_ten.append(getHitRatio(top_ten_items, i))
    hits_five.append(getHitRatio(top_ten_items[:5], i))
    ndcgs_ten.append(getNDCG(top_ten_items, i))
    ndcgs_five.append(getNDCG(top_ten_items[:5], i))
# `hits` was never defined in this cell; report the hit ratio at 10.
print(np.average(hits_ten))

In [None]:
# Each label now names the metric that is actually printed on that line.
print("Hit @ 10: {:.2f}".format(np.average(hits_ten)))
print("Hit @ 5: {:.2f}".format(np.average(hits_five)))
print("NDCG @ 10: {:.2f}".format(np.average(ndcgs_ten)))
print("NDCG @ 5: {:.2f}".format(np.average(ndcgs_five)))

# 4.2 NCF

In [None]:
def get_model_ncf(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a dot-product collaborative-filtering model.

    The user and item ids are embedded into ``dims``-wide vectors, their dot
    product is taken, and the result passes through one sigmoid unit.

    Args:
        num_users: Size of the user-id vocabulary.
        num_items: Size of the item-id vocabulary.
        dims: Embedding width.
        dense_layers: Not used by this model; accepted so the signature
            matches the other model builders.

    Returns:
        Compiled Model (Adam(0.01), MSE loss, accuracy) taking
        ``[user, item]`` as inputs.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")

    def embed(ids, vocab_size, name):
        # `input_length` is left out: it is deprecated in Keras 3 and the
        # sequence length is already fixed by the Input shape.
        return Embedding(input_dim=vocab_size,
                         output_dim=dims,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name=name)(ids)

    user_vecs = Reshape([dims])(embed(user_input, num_users, "mf_user_emb"))
    item_vecs = Reshape([dims])(embed(item_input, num_items, "mf_item_emb"))

    y = Dot(1, normalize=False)([user_vecs, item_vecs])
    y = Dense(1, activation="sigmoid")(y)

    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="mse",
        metrics=["accuracy"],
    )
    return model

model_ncf = get_model_ncf(num_users, num_items, dims)

In [None]:
# Turn the sampled training lists into column vectors of shape (n, 1).
users_input, articles_input, labels_input = np.array(train_users).reshape(-1,1), np.array(train_articles).reshape(-1,1), np.array(train_labels).reshape(-1,1)

In [None]:
# NOTE(review): this takes the row index of `train`, not its user_id column,
# and the name is not read again in this cell.
all_user_ids = train.index.unique().values

# Per-epoch training history. Each entry appended below is the one-element
# list that Keras returns for a single-epoch fit.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Placeholders for per-epoch ranking metrics; nothing in this cell fills them.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=4
for epoch in range(epochs):
    # One Keras epoch per iteration so the history can be collected per epoch.
    hist = model_ncf.fit([users_input, articles_input], labels_input, epochs=1, shuffle=True, verbose=1, batch_size=1024)
    
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    # No validation data is passed to fit(), so val_loss / val_acc stay empty.

In [None]:
# Evaluate on 100 held-out (user, latest article) pairs. The shuffle is seeded
# so every model in this notebook is scored on the same pairs.
test_users = test.user_id.values
test_items = test.article_id.values
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_users, test_items = test_users[:100], test_items[:100]
test_set = zip(test_users, test_items)
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
for (u,i) in tqdm(test_set, total=len(test_users)):
    # Rank the true item against the sampled non-interacted articles.
    not_interacted_items = get_not_interacted(u)
    items = np.array([i]+not_interacted_items)
    users = np.full(len(items), u)
    # Score with the NCF model trained in this section (this line previously
    # called model_neumf, so the NCF numbers were really NeuMF numbers).
    predictions = model_ncf.predict([users, items], verbose=0)
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    hits_ten.append(getHitRatio(top_ten_items, i))
    hits_five.append(getHitRatio(top_ten_items[:5], i))
    ndcgs_ten.append(getNDCG(top_ten_items, i))
    ndcgs_five.append(getNDCG(top_ten_items[:5], i))


In [None]:
# Each label now names the metric that is actually printed on that line.
print("Hit @ 10: {:.2f}".format(np.average(hits_ten)))
print("Hit @ 5: {:.2f}".format(np.average(hits_five)))
print("NDCG @ 10: {:.2f}".format(np.average(ndcgs_ten)))
print("NDCG @ 5: {:.2f}".format(np.average(ndcgs_five)))

# 4.3 Popularity based

In [None]:
# Popularity is counted on training events only: counting on the full `df`
# would include each user's held-out test click and leak it into the baseline.
most_popular_df = (
    train_true["article_id"]
    .value_counts()
    .rename_axis("article_id")
    .reset_index(name="counts")
)
# Article ids ordered from most to least clicked.
most_popular_articles = most_popular_df["article_id"].values

In [None]:
def popularity_recommender(top_n, user_interactions, most_popular_articles,num_unique_users):
    """Recommend the most popular articles each user has not interacted with.

    Args:
        top_n: Number of articles to recommend per user.
        user_interactions: dict mapping user id to an iterable of the article
            ids that user interacted with.
        most_popular_articles: Article ids ordered from most to least popular.
        num_unique_users: Unused; kept so existing callers keep working.

    Returns:
        dict mapping user id to a list of at most ``top_n`` distinct article
        ids in popularity order (shorter when fewer unseen articles exist).
    """
    recommendations = {}
    for u, interacted in tqdm(user_interactions.items()):
        seen = set(interacted)  # constant-time membership checks
        picks = []
        # One walk down the popularity list: no article can be picked twice
        # and the walk cannot run past the end of the list.
        for article in most_popular_articles:
            if len(picks) >= top_n:
                break
            if article not in seen:
                picks.append(article)
        recommendations[u] = picks
    return recommendations

# df_train also holds the sampled negatives (label 0); only label-1 rows are
# real interactions that should be excluded from the recommendations.
positive_interactions = df_train[df_train["label"] == 1]
user_interactions = positive_interactions.groupby("user_id")["article_id"].apply(list).to_dict()
num_unique_users = len(df_train["user_id"].unique())
recs = popularity_recommender(10, user_interactions, most_popular_articles, num_unique_users)

In [None]:
# Score the popularity recommendations on the first 100 held-out pairs
# (unlike the model evaluations, the pairs are not shuffled here).
test_users = test.user_id.values
test_items = test.article_id.values
test_users, test_items = test_users[:100], test_items[:100]
test_set = zip(test_users, test_items)
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
for (u,i) in tqdm(test_set):
    # NOTE(review): raises KeyError for a test user with no entry in `recs`.
    top_ten_items = recs[u]
    
    hits_ten.append(getHitRatio(top_ten_items, i))
    hits_five.append(getHitRatio(top_ten_items[:5], i))
    ndcgs_ten.append(getNDCG(top_ten_items, i))
    ndcgs_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Each label now names the metric that is actually printed on that line.
print("Hit @ 10: {:.2f}".format(np.average(hits_ten)))
print("Hit @ 5: {:.2f}".format(np.average(hits_five)))
print("NDCG @ 10: {:.2f}".format(np.average(ndcgs_ten)))
print("NDCG @ 5: {:.2f}".format(np.average(ndcgs_five)))

# 4.4 Wide and deep with features

In [None]:
# Display the number of distinct articles (size of the item-id vocabulary).
NUM_ARTICLES

In [None]:
def get_model_wide(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a two-tower model over ids and category features.

    One tower embeds the user and item ids, the other embeds the item
    category and the 6-slot user category profile. Each tower runs through
    Dense(128) and Dense(64) with dropout; their outputs are concatenated and
    fed through Dense(128) to one sigmoid unit.

    Args:
        num_users: Size of the user-id vocabulary.
        num_items: Size of the item-id vocabulary.
        dims: Width of every embedding.
        dense_layers: Not used by this model; accepted so the signature
            matches the other model builders.

    Returns:
        Compiled Model (Adam(0.001), binary cross-entropy, accuracy) taking
        ``[user_id, user_profile, item_id, category_input]`` as inputs. The
        category vocabulary size is read from the global NUM_CATEGORIES.
    """
    def dense_block(tensor, units, rate):
        # Regularized ReLU layer followed by dropout.
        tensor = Dense(units, activation="relu",
                       kernel_initializer='he_uniform',
                       kernel_regularizer=regularizers.l2(0.001))(tensor)
        return Dropout(rate)(tensor)

    #### Id tower ####
    user_id_input = Input(shape=[1], name="user_id")
    item_id_input = Input(shape=[1], name="item_id")
    # The vocabulary sizes come from the arguments rather than the globals,
    # so the builder honours what the caller passes in. `input_length` is
    # left out: it is deprecated in Keras 3 and implied by the Input shape.
    user_embedding = Embedding(input_dim=num_users,
                               output_dim=dims,
                               embeddings_initializer='he_normal',
                               embeddings_regularizer=regularizers.l2(0.001),
                               name="user_embedding")(user_id_input)
    item_embedding = Embedding(input_dim=num_items,
                               output_dim=dims,
                               embeddings_initializer='he_normal',
                               embeddings_regularizer=regularizers.l2(0.001),
                               name="item_embedding")(item_id_input)

    id_features = Concatenate()([Flatten()(user_embedding), Flatten()(item_embedding)])
    x_deep = dense_block(id_features, 128, 0.2)
    x_deep = dense_block(x_deep, 64, 0.2)

    #### Category-feature tower ####
    user_profile_input = Input(shape=(6,), name="user_profile")
    item_category_input = Input(shape=(1,), name="category_input")

    item_category_emb = Embedding(input_dim=NUM_CATEGORIES, output_dim=dims,
                                  embeddings_regularizer=regularizers.l2(0.001),
                                  name="category_emd")(item_category_input)
    user_profile_emb = Embedding(input_dim=NUM_CATEGORIES, output_dim=dims,
                                 embeddings_regularizer=regularizers.l2(0.001),
                                 name="profile_emb")(user_profile_input)

    wide_features = Concatenate()([Flatten()(item_category_emb), Flatten()(user_profile_emb)])
    x_wide = dense_block(wide_features, 128, 0.5)
    x_wide = dense_block(x_wide, 64, 0.5)

    #### Joint head ####
    final = Concatenate()([x_deep, x_wide])
    x = Dense(128, kernel_initializer='he_uniform', activation="relu")(final)
    x = Dropout(0.5)(x)
    y = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[user_id_input, user_profile_input, item_id_input, item_category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_wide = get_model_wide(num_users, num_items, dims)

In [None]:
# Turn the sampled training lists into column vectors of shape (n, 1).
users_input, articles_input, labels_input = np.array(train_users).reshape(-1,1), np.array(train_articles).reshape(-1,1), np.array(train_labels).reshape(-1,1)
categories_input = np.array(train_categories).reshape(-1,1)


In [None]:
# The input order passed to fit() must match the model's input list:
# user id, user profile, item id, item category.
# Per-epoch training history. Each entry appended below is the one-element
# list that Keras returns for a single-epoch fit.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Placeholders for per-epoch ranking metrics; nothing in this cell fills them.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=4
for epoch in range(epochs):
    # One Keras epoch per iteration so the history can be collected per epoch.
    hist = model_wide.fit([users_input, train_profiles, articles_input, categories_input], labels_input, epochs=1, shuffle=True, verbose=1, batch_size=1024)
    
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])

In [None]:
# Evaluate on 100 held-out (user, latest article) pairs. The shuffle is seeded
# so every model in this notebook is scored on the same pairs.
test_users = test.user_id.values
test_items = test.article_id.values
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_users, test_items = test_users[:100], test_items[:100]
test_set = zip(test_users, test_items)
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
for (u,i) in tqdm(test_set, total=len(test_users)):
    # Rank the true item against the sampled non-interacted articles.
    not_interacted_items = get_not_interacted(u)
    items = np.array([i]+not_interacted_items)
    users = np.full(len(items), u)
    profiles = np.tile(np.array(userid_to_profile[u]), (len(items), 1))
    # Every candidate gets its own category; previously the true item's
    # category was repeated for all candidates.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)

    predictions = model_wide.predict([users,profiles, items,categories], verbose=0)
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    hits_ten.append(getHitRatio(top_ten_items, i))
    hits_five.append(getHitRatio(top_ten_items[:5], i))
    ndcgs_ten.append(getNDCG(top_ten_items, i))
    ndcgs_five.append(getNDCG(top_ten_items[:5], i))


In [None]:
# Each label now names the metric that is actually printed on that line.
print("Hit @ 10: {:.2f}".format(np.average(hits_ten)))
print("Hit @ 5: {:.2f}".format(np.average(hits_five)))
print("NDCG @ 10: {:.2f}".format(np.average(ndcgs_ten)))
print("NDCG @ 5: {:.2f}".format(np.average(ndcgs_five)))

# 4.5 NeuMF with features

In [1]:
def get_model_neumffeat(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a NeuMF model whose MLP branch also sees category features.

    The matrix-factorization branch multiplies ``dims``-wide user/item
    embeddings element-wise. The MLP branch concatenates the user, item,
    item-category and user-profile embeddings (each ``dense_layers[0] / 2``
    wide) and runs them through the dense layers with dropout. Both branch
    outputs are concatenated and fed to one sigmoid unit.

    Args:
        num_users: Size of the user-id vocabulary.
        num_items: Size of the item-id vocabulary.
        dims: Embedding width of the matrix-factorization branch.
        dense_layers: Unit counts of the MLP layers, in order.

    Returns:
        Compiled Model (Adam(0.01), binary cross-entropy, accuracy) taking
        ``[user, user_profile, item, category_input]`` as inputs. The category
        vocabulary size is read from the global NUM_CATEGORIES.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    profile_input = Input(shape=(6,), name="user_profile")
    category_input = Input(shape=(1,), name="category_input")

    def embed_ids(ids, vocab_size, width, name):
        # The vocabulary sizes come from the function arguments, not globals.
        # `input_length` is left out: it is deprecated in Keras 3 and implied
        # by the Input shape.
        return Embedding(input_dim=vocab_size,
                         output_dim=width,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name=name)(ids)

    mlp_width = int(dense_layers[0] / 2)
    mf_user_emb = embed_ids(user_input, num_users, dims, "mf_user_emb")
    mf_item_emb = embed_ids(item_input, num_items, dims, "mf_item_emb")
    mlp_user_emb = embed_ids(user_input, num_users, mlp_width, "mlp_user_emb")
    mlp_item_emb = embed_ids(item_input, num_items, mlp_width, "mlp_user_item")

    item_category_emb = Embedding(input_dim=NUM_CATEGORIES,
                                  output_dim=mlp_width,
                                  name="category_emd",
                                  embeddings_regularizer=regularizers.l2(0.001))(category_input)
    user_profile_emb = Embedding(input_dim=NUM_CATEGORIES,
                                 output_dim=mlp_width,
                                 embeddings_regularizer=regularizers.l2(0.001),
                                 name="profile_emb")(profile_input)

    # Matrix factorization branch: element-wise product of the two vectors.
    mf_vec = multiply([Reshape([dims])(mf_user_emb), Reshape([dims])(mf_item_emb)])

    # MLP branch: the id embeddings and the feature embeddings go through the
    # dense tower together. Previously the tower output was overwritten by the
    # raw id embeddings, so profile and category never reached the prediction.
    mlp_vector = Concatenate()([
        Flatten()(mlp_user_emb),
        Flatten()(mlp_item_emb),
        Flatten()(item_category_emb),
        Flatten()(user_profile_emb),
    ])
    for num_dense in dense_layers:
        mlp_vector = Dense(num_dense, activation="relu")(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)

    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, profile_input, item_input,category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
dims = 20
model_neumffeat = get_model_neumffeat(NUM_USERS, NUM_ARTICLES, dims)

NameError: name 'NUM_USERS' is not defined

In [2]:
# Turn the sampled training lists into column vectors of shape (n, 1).
users_input, articles_input, labels_input = np.array(train_users).reshape(-1,1), np.array(train_articles).reshape(-1,1), np.array(train_labels).reshape(-1,1)
categories_input = np.array(train_categories).reshape(-1,1)

NameError: name 'np' is not defined

In [None]:
# The input order passed to fit() must match the model's input list:
# user id, user profile, item id, item category.
# Per-epoch training history. Each entry appended below is the one-element
# list that Keras returns for a single-epoch fit.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Placeholders for per-epoch ranking metrics; nothing in this cell fills them.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=4
for epoch in range(epochs):
    # One Keras epoch per iteration so the history can be collected per epoch.
    hist = model_neumffeat.fit([users_input, train_profiles, articles_input, categories_input], labels_input, epochs=1, shuffle=True, verbose=1, batch_size=1024)
    
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])

In [None]:
# Evaluate on 100 held-out (user, latest article) pairs. The shuffle is seeded
# so every model in this notebook is scored on the same pairs.
test_users = test.user_id.values
test_items = test.article_id.values
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_users, test_items = test_users[:100], test_items[:100]
test_set = zip(test_users, test_items)
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
for (u,i) in tqdm(test_set, total=len(test_users)):
    # Rank the true item against the sampled non-interacted articles.
    not_interacted_items = get_not_interacted(u)
    items = np.array([i]+not_interacted_items)
    users = np.full(len(items), u)
    profiles = np.tile(np.array(userid_to_profile[u]), (len(items), 1))
    # Every candidate gets its own category; previously the true item's
    # category was repeated for all candidates.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)

    predictions = model_neumffeat.predict([users,profiles, items,categories], verbose=0)
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    hits_ten.append(getHitRatio(top_ten_items, i))
    hits_five.append(getHitRatio(top_ten_items[:5], i))
    ndcgs_ten.append(getNDCG(top_ten_items, i))
    ndcgs_five.append(getNDCG(top_ten_items[:5], i))


In [None]:
# Each label now names the metric that is actually printed on that line.
print("Hit @ 10: {:.2f}".format(np.average(hits_ten)))
print("Hit @ 5: {:.2f}".format(np.average(hits_five)))
print("NDCG @ 10: {:.2f}".format(np.average(ndcgs_ten)))
print("NDCG @ 5: {:.2f}".format(np.average(ndcgs_five)))