In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tqdm import tqdm
import re
import scipy
#from tensorflow import keras
from tensorflow.keras.layers import Input,Flatten, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization, multiply
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
#import keras.backend as K
from sklearn.utils import shuffle
import seaborn as sns
import math
import pickle
import collections
from collections import Counter

In [None]:
# Directory holding the pickled data files, relative to the notebook's working directory.
PATH = "../data/addressa/"
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(PATH + "articles.bin", "rb") as f_in:
    articles = pickle.load(f_in)
# Other behaviour files mentioned by the original author: behaviors.bin and behaviors_two_days.bin.
# The file actually loaded below is behaviors_7_days.bin.
with open(PATH + "behaviors_7_days.bin", "rb") as f_in:
    full_behaviors = pickle.load(f_in)

In [None]:
# NB! REMOVE THIS (CAN STILL SHUFFLE IT THOUGH)
# Orders the interactions by the "user" column. NOTE(review): presumably this keeps each
# user's rows together before the row cut-off in the next cell - confirm with the author.
full_behaviors= full_behaviors.sort_values(by=["user"])
#behaviors = behaviors.sample(frac=1).reset_index(drop=True)[:10000]

In [None]:
# Keep only the first 150000 rows of the (sorted) interactions for the rest of the notebook.
full_behaviors = full_behaviors[:150000]

# 1. Preprocessing

In [None]:
# NOTE(review): the inplace drop/rename below mutate the loaded frames, so this cell is not
# safe to run twice without reloading the data (the second drop of "articleId" raises KeyError).
# Interpret the "time" column as seconds since the epoch and convert it to datetimes.
full_behaviors["time"] = pd.to_datetime(full_behaviors["time"], unit="s")
full_behaviors.drop(columns=["articleId"], inplace=True)
# Keep a single row per (user, article id) pair.
full_behaviors = full_behaviors.drop_duplicates(["user", "id"])
print("before merge: ",len(full_behaviors))
# Drop the behaviour-side title/author columns before merging with the articles table.
full_behaviors = full_behaviors.drop(columns=["title", "author"])
articles.rename(columns={"article_id": "id"}, inplace=True)
# Default (inner) merge on "id": interactions without a matching article are dropped.
full_behaviors = full_behaviors.merge(articles, on=["id"])
print("after merge:",len(full_behaviors))

print("Len before removal: ",len(full_behaviors))
# Keep only users that have more than two interactions.
behaviors = full_behaviors[full_behaviors.groupby('user').user.transform('count')>2].copy()
print("Len after removal: ",len(behaviors))


# Encode the raw user and article identifiers as integer ids (LabelEncoder yields 0..n-1).
user_enc = LabelEncoder()
article_enc = LabelEncoder()
behaviors["user_id"] = user_enc.fit_transform(behaviors["user"].values)
behaviors["article_id"] = article_enc.fit_transform(behaviors["id"].values)





In [None]:
import nltk
from nltk.corpus import stopwords
# Helper functions
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """
    Remove the NLTK Norwegian stop words from ``text``.

    The text is split on whitespace, tokens found in the stop-word list are
    discarded, and the remaining tokens are joined with single spaces.
    """
    tokens = text.split()
    norwegian_stops = set(stopwords.words("norwegian"))
    kept_tokens = filter(lambda token: token not in norwegian_stops, tokens)
    return " ".join(kept_tokens)

def remove_html(text):
    """Strip every ``<...>`` tag (shortest match) from ``text``."""
    without_tags = re.sub('<.*?>', r'', text)
    return without_tags

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def text_to_list(text):
    """Split ``text`` on single space characters and return the resulting list."""
    pieces = text.split(" ")
    return pieces

def take_one_category(text):
    """
    Pick one whitespace-separated token from ``text``.

    Returns the second token when there is more than one token, otherwise the
    first (and only) token.
    """
    tokens = text.split()
    return tokens[1] if len(tokens) > 1 else tokens[0]

In [None]:
def clean_title(df):
    """
    Add a ``title_cleaned`` column to ``df`` (in place) and return ``df``.

    The title is lower-cased, stripped of stop words and then stripped of
    punctuation, in that order.
    """
    current = df.title
    for step in (make_lower_case, remove_stop_words, remove_punctuation):
        df["title_cleaned"] = current.apply(func = step)
        current = df.title_cleaned
    return df
def hyphen_to_underline(category):
    """
    Replace every hyphen in a (sub)category name with an underscore, so that
    Tfidf works correctly on it.
    """
    parts = category.split("-")
    return "_".join(parts)
# Add the cleaned title column, then reduce each kw_category string to a single token.
behaviors = clean_title(behaviors)
behaviors["category_cleaned"] = behaviors["kw_category"].apply(func = take_one_category)

In [None]:
# Preview one row to check the columns added so far.
behaviors.head(1)

In [None]:
# Encode the cleaned category strings as integers.
# NOTE(review): category_enc is created but never fitted in the visible code;
# the encoder that is actually used is subcategory_enc.
category_enc = LabelEncoder()
subcategory_enc = LabelEncoder()
behaviors["category_int"] = subcategory_enc.fit_transform(behaviors["category_cleaned"].values)


In [None]:
# Number of distinct encoded categories.
len(behaviors["category_int"].unique())

In [None]:
# Number of distinct users (raw "user" identifiers).
len(behaviors["user"].unique())

In [None]:
# Number of distinct encoded article ids.
len(behaviors["article_id"].unique())

In [None]:
#sns.histplot(behaviors["article_id"])

In [None]:
# Build one profile per user: the user's categories ordered by how often they
# occur (ties broken by the larger category id first), truncated or padded with 0
# to exactly six entries.
# NOTE(review): 0 is also a valid LabelEncoder category id, so padded slots are
# indistinguishable from category 0.
users = behaviors["user_id"].unique()
userid_to_profile = collections.defaultdict(list)
# A single groupby pass replaces re-filtering the whole frame once per user;
# sort=False keeps the users in order of first appearance, like unique() does.
for user_id, user_categories in tqdm(behaviors.groupby("user_id", sort=False)["category_int"]):
    counter = Counter(user_categories.values.tolist())
    ranked_categories = sorted(counter, key=lambda category: (counter[category], category), reverse=True)[:6]
    userid_to_profile[user_id] = ranked_categories + [0] * (6 - len(ranked_categories))

In [None]:
# One row per user, one column per profile slot.
profile_df = pd.DataFrame.from_dict(userid_to_profile, orient="index")
# from_dict(orient="index") labels the slots with the integers 0..5, so the rename has
# to use integer keys; renaming the strings "0".."5" matches nothing and leaves the
# columns unnamed.
profile_df = profile_df.rename(columns={slot: "p{}".format(slot) for slot in range(6)})
profile_df["user_id"] = profile_df.index
behaviors = behaviors.merge(profile_df, on="user_id")

# Lookup table: encoded article id -> encoded category.
article_id_to_category_int = behaviors[["article_id", "category_int"]].set_index("article_id").to_dict()
article_id_to_category_int = article_id_to_category_int["category_int"]

behaviors.head(1)

In [None]:
#behaviors.user_id.value_counts()

In [None]:
#behaviors.head()

# 2. Train test split


In [None]:
# Rank each user's interactions from newest (rank 1) to oldest. method="first" breaks
# timestamp ties by row order, so exactly one row per user gets rank 1.
behaviors["rank_latest"] = behaviors.groupby(["user_id"])["time"].rank(method="first", ascending=False)

# Leave-one-out split: the newest interaction of every user is the test item.
train_true = behaviors[behaviors['rank_latest'] != 1]
test_true = behaviors[behaviors['rank_latest'] == 1]

# Mark every training row as a positive interaction. The list is assigned by position;
# wrapping it in pd.Series would align on index labels and, because train_true keeps the
# gapped index of behaviors, leave NaN in every row whose label is >= len(train_true).
rating = [1 for i in range(len(train_true))]
train_true = train_true.assign(e=rating)
#train_true.loc[-1,"label"] = rating

In [None]:
def get_userid_to_article_history(df, max_len=10, padding=0):
    """
    Map every user to a fixed-length array of article ids taken from ``df``.

    params:
        df: frame with "user_id" and "article_id" columns
        max_len: length of every history array (default 10)
        padding: value appended to histories shorter than ``max_len`` (default 0)
    returns:
        dict user_id -> numpy array of length ``max_len`` holding the user's first
        ``max_len`` article ids in row order, padded at the end when needed
    """
    userid_to_article_history = {}
    # One groupby pass instead of filtering the whole frame once per user;
    # sort=False keeps the users in order of first appearance.
    for user_id, user_articles in tqdm(df.groupby("user_id", sort=False)["article_id"]):
        click_history = user_articles.values[:max_len]
        missing = max_len - len(click_history)
        if missing > 0:
            click_history = np.append(click_history, [padding] * missing)
        userid_to_article_history[user_id] = click_history
    return userid_to_article_history
# Histories are built from the training interactions only (the held-out test click is excluded).
userid_to_article_history = get_userid_to_article_history(train_true)

In [None]:
# Sanity check: number of users with a history vs. number of users overall.
print(len(userid_to_article_history))
print(len(behaviors["user_id"].unique()))

In [None]:
all_article_ids = behaviors["article_id"].unique()

def negative_sampling(train_df, all_article_ids, user_id, article_id, num_negatives=4):
    """
    Build the training set: every positive (user, article) pair in ``train_df`` plus
    ``num_negatives`` randomly drawn articles that user has no training pair with.

    Reads the notebook-level lookups ``userid_to_article_history``,
    ``userid_to_profile`` and ``article_id_to_category_int``.

    params:
        train_df: frame holding the positive interactions
        all_article_ids: pool of article ids to draw negatives from
        user_id: name of the user column in ``train_df``
        article_id: name of the article column in ``train_df``
        num_negatives: negatives drawn per positive (default 4)
    returns:
        shuffled DataFrame with the columns user_id, user_history, p0..p5,
        article_id, article_category and labels (1 = positive, 0 = sampled negative)
    """
    user_item_set = set(zip(train_df[user_id],
                            train_df[article_id]))
    columns = ["user_id","user_history","p0", "p1", "p2", "p3", "p4", "p5", "article_id","article_category", "labels"]

    rows = []
    for (u, i) in tqdm(user_item_set):
        profile = np.array(userid_to_profile[u])
        # Features shared by the positive row and all of its negatives.
        user_features = (u, userid_to_article_history[u],
                         profile[0], profile[1], profile[2], profile[3], profile[4], profile[5])

        # Each row carries its own article, category and label together, so the three
        # can never drift apart (the positive's category used to be paired with the
        # first negative's article and label).
        rows.append(user_features + (i, article_id_to_category_int[i], 1))

        for _ in range(num_negatives):
            negative_item = np.random.choice(all_article_ids)
            # Redraw until the article is not one of this user's positives.
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(all_article_ids)
            rows.append(user_features + (negative_item, article_id_to_category_int[negative_item], 0))

    rows = shuffle(rows, random_state=0)

    return pd.DataFrame(rows, columns=columns)



# Training frame: positives from train_true plus sampled negatives.
df_train = negative_sampling(train_true, all_article_ids, "user_id", "article_id")

In [None]:
def fix_dftrain(df, column, max_len, padding):
    """
    Expand the sequence column ``column`` of ``df`` into ``max_len`` scalar columns.

    Column ``<column>_<k>`` receives element ``k`` of each row's sequence, or
    ``padding`` when the sequence is shorter. ``df`` is modified in place and returned.
    """
    for position in tqdm(range(max_len)):
        def pick(sequence, position=position):
            if position < len(sequence):
                return sequence[position]
            return padding
        df["_".join((column, str(position)))] = df[column].apply(pick)
    return df

# Spread the 10-element click history over user_history_0..user_history_9, then drop
# the original array column. The new columns are appended after "labels".
df_train = fix_dftrain(df_train, "user_history", 10, 0)
df_train.drop(columns=["user_history"], inplace=True)
df_train.head()

In [None]:
#df_train[df_train["user_id"]==1752]

In [None]:
# For each user; for each item the user has interacted with in the test set;
    # Sample 99 items the user has not interacted with in the past and add the one test item  
    
def negative_sample_testset(ordiginal_df, df_test, all_article_ids, user_id, article_id, num_negatives=99):
    """
    Build the ranking test set: for every (user, true item) pair in ``df_test`` emit
    the true item followed by ``num_negatives`` articles the user never interacted
    with in ``ordiginal_df``.

    Reads the notebook-level lookups ``userid_to_profile``,
    ``userid_to_article_history`` and ``article_id_to_category_int``.

    params:
        ordiginal_df: frame with all interactions, used to find interacted articles
        df_test: frame holding the held-out interactions
        all_article_ids: pool of article ids to draw negatives from
        user_id: name of the user column
        article_id: name of the article column
        num_negatives: negatives drawn per test item (default 99)
    returns:
        (X_test, df_test, userid_to_true_item): the rows as a 2-D object array, the same
        rows as a DataFrame, and a dict user_id -> held-out article id
    """
    test_user_item_set = set(zip(df_test[user_id], df_test[article_id]))
    user_interacted_items = ordiginal_df.groupby(user_id)[article_id].apply(list).to_dict()
    # Loop-invariant: build the article pool once instead of once per user.
    article_pool = set(all_article_ids)
    res_arr = []

    userid_to_true_item = {} # keep track of the real items
    for (u,i) in tqdm(test_user_item_set):
        candidates = list(article_pool - set(user_interacted_items[u]))
        # Draw distinct negatives; fall back to sampling with replacement only when
        # the user has fewer unseen articles than requested.
        selected_not_interacted = list(np.random.choice(candidates,
                                                        num_negatives,
                                                        replace=len(candidates) < num_negatives))
        test_items = [i] + selected_not_interacted
        profile = userid_to_profile[u]
        history = userid_to_article_history[u]
        res_arr.append([[u,
                         history,
                         profile[0],
                         profile[1],
                         profile[2],
                         profile[3],
                         profile[4],
                         profile[5],
                         item, article_id_to_category_int[item]] for item in test_items])
        userid_to_true_item[u] = i
    # Every row mixes scalars with the history array, so the nesting is ragged;
    # NumPy >= 1.24 refuses to build such an array unless dtype=object is given.
    X_test = np.array(res_arr, dtype=object)
    X_test = X_test.reshape(-1, X_test.shape[-1])
    df_test = pd.DataFrame(X_test, columns=["user_id",
                                            "click_history",
                                            "p0",
                                            "p1",
                                            "p2",
                                            "p3",
                                            "p4",
                                            "p5",
                                            "article_id",
                                            "category"])
    return X_test, df_test, userid_to_true_item
# Build the ranking test set from the held-out interactions (one true item plus sampled negatives per user).
X_test, df_test, userid_to_true_item = negative_sample_testset(behaviors, test_true, behaviors["article_id"].unique(), "user_id", "article_id")
    
    

In [None]:
#df_test[df_test["user_id"] == 4744]

In [None]:
def fix_dftest(df, column, max_len, padding):
    """
    Expand the sequence column ``column`` of ``df`` into ``max_len`` scalar columns
    named ``<column>_0`` .. ``<column>_<max_len - 1>``.

    Rows whose sequence is too short get ``padding`` in the missing slots.
    ``df`` is modified in place and returned.
    """
    for slot in tqdm(range(max_len)):
        def element_at_slot(sequence):
            # Called immediately by apply(), so ``slot`` is the current loop value.
            return sequence[slot] if slot < len(sequence) else padding
        target = column + "_" + str(slot)
        df[target] = df[column].apply(element_at_slot)
    return df

# Spread the click history over click_history_0..click_history_9 and drop the array column,
# leaving user_id, p0..p5, article_id, category as the first nine columns.
df_test = fix_dftest(df_test, "click_history", 10, 0)
df_test.drop(columns=["click_history"], inplace=True)

# 4. Models

In [None]:
def getHitRatio(ranklist, gtItem):
    """Return 1 when ``gtItem`` equals some entry of ``ranklist``, otherwise 0."""
    found = any(item == gtItem for item in ranklist)
    return 1 if found else 0

def getNDCG(ranklist, gtItem):
    """
    Return log(2) / log(position + 2) for the first zero-based ``position`` at which
    ``ranklist`` holds ``gtItem``, or 0 when the item is not in the list.
    """
    for position, item in enumerate(ranklist):
        if item == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0

# 4.1 NeuMF - without features

In [None]:

def evaluate_one_rating_neumf(model, user_id, all_articles, true_item):
    """
    Score one user's candidate articles with ``model`` and compute ranking metrics.

    params:
        model: object whose ``predict([users, items])`` returns one score per row
        user_id: user id, repeated once per candidate as the user input
        all_articles: candidate article ids for this user
        true_item: held-out article id to look for among the top-ranked candidates
    returns:
        (hr, ndcg, hr_five, ndcg_five): hit ratio and NDCG within the ten and the
        five highest-scored candidates
    """
    # Column vectors with one row per candidate. The user column is sized from the
    # candidates instead of being hard-coded to 100 rows.
    all_articles = np.array(all_articles).reshape(-1,1)
    num_candidates = len(all_articles)
    expanded_user_id = np.array([user_id]*num_candidates).reshape((num_candidates,1))

    predictions = model.predict([expanded_user_id, all_articles])
    # Flatten to one score per candidate (also valid for a single candidate).
    predicted_labels = np.asarray(predictions).reshape(-1)
    # Candidates ordered by descending score, best ten kept.
    top_ten_items = [all_articles[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    hr = getHitRatio(top_ten_items, true_item)
    ndcg = getNDCG(top_ten_items, true_item)
    hr_five = getHitRatio(top_ten_items[:5], true_item)
    ndcg_five = getNDCG(top_ten_items[:5], true_item)
    return hr, ndcg, hr_five, ndcg_five

def evalaute_model_neumf(model, df_test, userid_to_true_item):
    """
    Evaluate ``model`` on the first 400 distinct users of ``df_test``.

    Returns four lists with one entry per evaluated user:
    hits@10, ndcg@10, hits@5 and ndcg@5.
    """
    print("Evaluate model")
    hits, ndcgs, hits_five, ndcgs_five = [], [], [], []
    metric_lists = (hits, ndcgs, hits_five, ndcgs_five)
    for user_id in tqdm(df_test["user_id"].unique()[:400]):
        # All candidate rows of this user (true item plus sampled negatives).
        candidate_rows = df_test[df_test["user_id"] == user_id]
        true_item = userid_to_true_item[user_id]
        all_articles = candidate_rows["article_id"].to_numpy().astype(int)

        hit, gain, hit_five, gain_five = evaluate_one_rating_neumf(model, user_id, all_articles, true_item)
        for bucket, value in zip(metric_lists, (hit, gain, hit_five, gain_five)):
            bucket.append(value)
    return hits, ndcgs, hits_five, ndcgs_five

In [None]:
# Embedding table sizes (distinct encoded users / articles) and the MF embedding width.
num_users = len(behaviors["user_id"].unique())
num_items = len(behaviors["article_id"].unique())
dims = 20
def get_model_neumf(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile the NeuMF model (matrix-factorisation branch + MLP branch).

    params:
        num_users: size of the user embedding tables
        num_items: size of the item embedding tables
        dims: embedding width of the matrix-factorisation branch
        dense_layers: widths of the MLP layers; the MLP embeddings use half of the first width
    returns:
        compiled Keras Model taking [user, item] and emitting one sigmoid score
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    
    # Embeddings of the matrix-factorisation branch.
    mf_user_emb = Embedding(output_dim=dims, 
                            input_dim=num_users, 
                            input_length=1, 
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims, 
                            input_dim=num_items, 
                            input_length=1, 
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)
    
    # NOTE(review): num_layers is never used below.
    num_layers = len(dense_layers)
    # Separate embeddings for the MLP branch.
    mlp_user_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_users, 
                             input_length=1, 
                             embeddings_initializer='he_normal', 
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    # The layer is named "mlp_user_item" although it embeds the item input.
    mlp_item_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_items, 
                             input_length=1, 
                             embeddings_initializer='he_normal', 
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)
    
    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    
    # Element-wise product of the user and item vectors.
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])
    
    #MLP
    mlp_vec = Concatenate()([mlp_user_emb, mlp_item_emb])
    mlp_vector = Flatten()(mlp_vec)
    
    for num_nodes in dense_layers:
        l = Dense(num_nodes, activation="relu")
        mlp_vector = l(mlp_vector)
    
    # Fuse both branches and predict the click probability.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)
    
    
    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf = get_model_neumf(num_users, num_items, dims)

In [None]:
# Positional columns of df_train after user_history was dropped:
# 0 = user_id, 1-6 = p0..p5, 7 = article_id, 8 = article_category, 9 = labels.
user_input = df_train.iloc[:, 0].values.reshape((-1,1))
item_input = df_train.iloc[:, 7].values.reshape((-1,1))
labels = df_train.iloc[:, 9].values.reshape((-1,1))
print(user_input.shape, item_input.shape, labels.shape )

In [None]:
all_user_ids = df_train["user_id"].unique()

#user_input = df_train.iloc[:, 0].values.reshape((-1,1))
#profile_input = df_train.iloc[:, 1:6].values
#item_input = df_train.iloc[:, 7].values.reshape((-1,1))
#labels = df_train.iloc[:, 8].values.reshape((-1,1))


# Per-epoch loss/accuracy collected from every fit() call below.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Ranking metrics; the "best" values stay 0 while the evaluation code
# inside the loop is commented out.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=10
# One fit() call per epoch, leaving room for per-epoch evaluation between epochs.
for epoch in range(epochs):
    hist = model_neumf.fit([user_input, item_input], labels, epochs=1, shuffle=True, verbose=1, validation_split=0.1, batch_size=512)
    
    # Each history entry is a list with one value (epochs=1), so these become lists of one-element lists.
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])
    
    #hits, ndcgs, hits_five, ndcgs_five = evalaute_model_neumf( model_neumf, df_test, userid_to_true_item)
    #hits_list.append(np.average(hits))
    #ndcg_list.append(np.average(ndcgs))
    
    #temp_hits = np.average(hits)
    #temp_ndcgs = np.average(ndcgs)
    #if (temp_hits > best_hits):
    #    best_hits = temp_hits
    #    best_ndcgs = temp_ndcgs
    #    best_hits_five = np.average(hits_five)
    #    best_ndcgs_five = np.average(ndcgs_five)

In [None]:
# Best ranking metrics recorded during training. The last two lines report the
# top-5 variants, so they are labelled "@ 5" (they used to be printed as "@ 10").
print("Hit @ 10: {:.2f}".format(best_hits))
print("ndcgs @ 10: {:.2f}".format(best_ndcgs))
print("Hit @ 5: {:.2f}".format(best_hits_five))
print("ndcgs @ 5: {:.2f}".format(best_ndcgs_five))

In [None]:
# Rank the test candidates with the trained NeuMF model.
hits, ndcgs, hits_five, ndcgs_five = evalaute_model_neumf( model_neumf, df_test, userid_to_true_item)

In [None]:
# Mean hit ratio @ 10 over the evaluated users.
print(np.average(hits))

In [None]:
# NOTE(review): matplotlib is imported here rather than in the imports cell at the top.
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
# Training vs. validation loss per epoch.
plt.plot(train_loss)
plt.plot(val_loss)
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Train', 'Val'], loc='upper left')
# NOTE(review): the other loss plots in this notebook save to the same file name,
# so each run overwrites the previous figure.
plt.savefig("final_loss.pdf")
plt.show()

# 4.2 Popularity based

In [None]:
# Article ids ordered by interaction count (value_counts sorts in descending order).
most_popular_df = pd.DataFrame(behaviors["article_id"].value_counts())
most_popular_df = most_popular_df.reset_index()
most_popular_df.columns=["article_id", "counts"]
most_popular_articles = most_popular_df["article_id"].values

In [None]:
def popularity_recommender(top_n, user_interactions, most_popular_articles,num_unique_users):
    """
    Recommend to every user the most popular articles the user has not interacted with.

    params:
        top_n: number of articles to recommend
        user_interactions: dict user_id -> list of article ids the user interacted with
        most_popular_articles: article ids ordered from most to least popular
        num_unique_users: unused, kept for backward compatibility
    returns:
        dict user_id -> list of at most ``top_n`` distinct article ids, most popular
        first (shorter when the user has fewer unseen articles than ``top_n``)
    """
    recommendations = {}
    for user, interacted_items in tqdm(user_interactions.items()):
        seen = set(interacted_items)
        picks = []
        # Walk the popularity ranking once, so no article is recommended twice and
        # the scan can never run past the end of the ranking.
        for article in most_popular_articles:
            if len(picks) == top_n:
                break
            if article not in seen:
                picks.append(article)
        recommendations[user] = picks
    return recommendations

# df_train also contains the sampled negatives (labels == 0). Only the label-1 rows are
# real clicks; counting negatives as interactions would wrongly hide those articles
# from the popularity recommendations.
positive_train = df_train[df_train["labels"] == 1]
user_interactions = positive_train.groupby("user_id")["article_id"].apply(list).to_dict()
num_unique_users = len(df_train["user_id"].unique())
recs = popularity_recommender(10, user_interactions, most_popular_articles, num_unique_users)

In [None]:
# Hit ratio of the popularity baseline on the first 400 distinct test users.
users = df_test["user_id"].unique()[:400]
hit_ten = 0
hit_five = 0
for user_id in tqdm(users):
    # NOTE(review): user_df is computed but not used in this loop.
    user_df = df_test[df_test["user_id"]==user_id]
    true_item = userid_to_true_item[user_id] # get the actual true item in the test set
    recommendations = recs[user_id]
    five_recommendations = recommendations[:5]
    if true_item in recommendations:
        hit_ten+=1
    if true_item in five_recommendations:
        hit_five += 1
# Fraction of users whose held-out article is among the 10 / 5 recommendations.
print(hit_ten/len(users))
print(hit_five / len(users))

# 4.3 Wide and deep - with features

In [None]:
def evaluate_one_rating_wide(model, user_id, user_profiles, all_articles,categories, true_item):
    """
    Score one user's candidate articles with the feature-based ``model`` and compute
    ranking metrics.

    params:
        model: object whose ``predict([users, profiles, items, categories])`` returns
            one score per row
        user_id: user id, repeated once per candidate as the user input
        user_profiles: profile features, one row per candidate
        all_articles: candidate article ids for this user
        categories: category id of every candidate
        true_item: held-out article id to look for among the top-ranked candidates
    returns:
        (hr, ndcg, hr_five, ndcg_five): hit ratio and NDCG within the ten and the
        five highest-scored candidates
    """
    # Column vectors with one row per candidate. The user column is sized from the
    # candidates instead of being hard-coded to 100 rows.
    all_articles = np.array(all_articles).reshape(-1,1)
    num_candidates = len(all_articles)
    expanded_user_id = np.array([user_id]*num_candidates).reshape((num_candidates,1))

    predictions = model.predict([expanded_user_id, user_profiles, all_articles,categories])
    # Flatten to one score per candidate (also valid for a single candidate).
    predicted_labels = np.asarray(predictions).reshape(-1)
    # Candidates ordered by descending score, best ten kept.
    top_ten_items = [all_articles[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    hr = getHitRatio(top_ten_items, true_item)
    ndcg = getNDCG(top_ten_items, true_item)
    hr_five = getHitRatio(top_ten_items[:5], true_item)
    ndcg_five = getNDCG(top_ten_items[:5], true_item)
    return hr, ndcg, hr_five, ndcg_five

def evalaute_model_wide(model, df_test, userid_to_true_item):
    """
    Evaluate the feature-based ``model`` on the first 400 distinct users of ``df_test``.

    Returns four lists with one entry per evaluated user:
    hits@10, ndcg@10, hits@5 and ndcg@5.
    """
    print("Evaluate model")
    hits, ndcgs, hits_five, ndcgs_five = [], [], [], []
    metric_lists = (hits, ndcgs, hits_five, ndcgs_five)
    for user_id in tqdm(df_test["user_id"].unique()[:400]):
        # All candidate rows of this user (true item plus sampled negatives).
        candidate_rows = df_test[df_test["user_id"] == user_id]
        true_item = userid_to_true_item[user_id]
        all_articles = candidate_rows["article_id"].to_numpy().astype(int)
        # Positional columns 1-6 hold the profile, column 8 the category.
        user_profiles = candidate_rows.iloc[:, 1:7].to_numpy().astype(int)
        categories = candidate_rows.iloc[:, 8].to_numpy().astype(int)

        hit, gain, hit_five, gain_five = evaluate_one_rating_wide(model, user_id, user_profiles, all_articles,categories, true_item)
        for bucket, value in zip(metric_lists, (hit, gain, hit_five, gain_five)):
            bucket.append(value)
    return hits, ndcgs,hits_five,ndcgs_five

In [None]:
# Embedding table sizes for the feature-based model; num_categories is read as a
# global by get_model_wide.
num_users = len(behaviors["user_id"].unique()) 
num_items = len(behaviors["article_id"].unique()) 
num_categories = len(behaviors["category_int"].unique()) 
dims = 20

In [None]:
def get_model_wide(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile the "wide and deep" model with user-profile and category features.

    params:
        num_users: size of the user embedding table
        num_items: size of the item embedding table
        dims: width of every embedding
        dense_layers: not used by this function
    returns:
        compiled Keras Model taking [user_id, user_profile, item_id, category]
        and emitting one sigmoid score

    NOTE(review): the category/profile embedding sizes come from the notebook-level
    variable ``num_categories``, not from a parameter.
    """
    #### Matrix factorization ####
    user_id_input = Input(shape=[1], name="user_id")
    item_id_input = Input(shape=[1], name="item_id")
    user_embedding = Embedding(input_dim=num_users, 
                               output_dim=dims, 
                               input_length=1, 
                               embeddings_initializer='he_normal', 
                               embeddings_regularizer=regularizers.l2(0.001),
                               name="user_embedding")(user_id_input)
    item_embedding = Embedding(input_dim=num_items, 
                               output_dim=dims, 
                               embeddings_initializer='he_normal', 
                               embeddings_regularizer=regularizers.l2(0.001),
                               name="item_embedding")(item_id_input)
    
    # The id embeddings are concatenated (not multiplied) and fed to a dense stack.
    user_flatten = Flatten()(user_embedding)
    item_flatten = Flatten()(item_embedding)
    mf_vec = Concatenate()([user_flatten, item_flatten])
    
    x_deep = Dense(128, activation="relu", kernel_initializer='he_uniform',kernel_regularizer=regularizers.l2(0.001))(mf_vec)
    x_deep = Dropout(0.2)(x_deep)
    x_deep = Dense(64, activation="relu",
                   kernel_initializer='he_uniform', 
                   kernel_regularizer=regularizers.l2(0.001))(x_deep)
    x_deep = Dropout(0.2)(x_deep)
    
    #### Wide part ####
    
    # Six profile category ids per user and one category id per item.
    user_profile_input = Input(shape=(6,), name="user_profile")
    item_category_input = Input(shape=(1,), name="category_input")
    
    item_category_emb = Embedding(input_dim=num_categories, output_dim=dims, name="category_emd", embeddings_regularizer=regularizers.l2(0.001))(item_category_input)
    user_profile_emb = Embedding(input_dim=num_categories, output_dim=dims,
                                 embeddings_regularizer=regularizers.l2(0.001), name="profile_emb")(user_profile_input)

    item_category_flatten = Flatten()(item_category_emb)
    user_profile_flatten = Flatten()(user_profile_emb)
    
    wide_features = Concatenate()([item_category_flatten,  user_profile_flatten])
    
    x_wide = Dense(128, activation="relu",kernel_initializer='he_uniform', kernel_regularizer=regularizers.l2(0.001))(wide_features)
    x_wide = Dropout(0.5)(x_wide)
    x_wide = Dense(64, activation="relu",kernel_initializer='he_uniform', kernel_regularizer=regularizers.l2(0.001))(x_wide)
    x_wide = Dropout(0.5)(x_wide)
    
    # Fuse both branches and predict the click probability.
    final = Concatenate()([x_deep,x_wide])
    x = Dense(128, kernel_initializer='he_uniform',activation="relu")(final)
    x = Dropout(0.5)(x)
    y = Dense(1, activation="sigmoid")(x)
    
    
    model = Model(inputs=[user_id_input, user_profile_input, item_id_input, item_category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_wide = get_model_wide(num_users, num_items, dims)

In [None]:
###### Training ########
user_input = df_train.iloc[:, 0].values.reshape((-1,1))
profile_input = df_train.iloc[:, 1:7].values
item_input = df_train.iloc[:, 7].values.reshape((-1,1))
labels = df_train.iloc[:, 9].values.reshape((-1))
category_input = df_train.iloc[:, 8].values.reshape((-1,1))
print(user_input.shape,profile_input.shape, item_input.shape,category_input.shape, labels.shape )

In [None]:
all_user_ids = df_train["user_id"].unique()

#user_input = df_train.iloc[:, 0].values.reshape((-1,1))
#profile_input = df_train.iloc[:, 1:6].values
#item_input = df_train.iloc[:, 7].values.reshape((-1,1))
#labels = df_train.iloc[:, 8].values.reshape((-1,1))

# Per-epoch loss/accuracy collected from every fit() call below.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# NOTE(review): only best_hits and best_ndcgs are reset here; best_hits_five and
# best_ndcgs_five keep whatever value an earlier cell left in them.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0


epochs=25

# One fit() call per epoch, leaving room for per-epoch evaluation between epochs.
for epoch in range(epochs):
    hist = model_wide.fit([user_input, profile_input, item_input,category_input], labels,validation_split=0.1, epochs=1, shuffle=True, verbose=1, batch_size=32)
    
    # Each history entry is a list with one value (epochs=1), so these become lists of one-element lists.
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])
    
    #hits, ndcgs, hits_five, ndcgs_five = evalaute_model_wide( model_wide, df_test, userid_to_true_item)
    #hits_list.append(np.average(hits))
    #ndcg_list.append(np.average(ndcgs))
    
    #temp_hits = np.average(hits)
    #temp_ndcgs = np.average(ndcgs)
    #if (temp_hits > best_hits):
    #    best_hits = temp_hits
    #    best_hits_five = np.average(hits_five)
    #    best_ndcgs_five = np.average(ndcgs_five)
    #    best_ndcgs = temp_ndcgs

In [None]:
# Best hits@10, ndcg@10, hits@5 and ndcg@5, in that order.
# NOTE(review): the wide-and-deep training cell does not reset the two "five" values,
# so they may still come from an earlier model - confirm before reporting them.
print(best_hits)
print(best_ndcgs)
print(best_hits_five)
print(best_ndcgs_five)

In [None]:
# Rank the test candidates with the trained wide-and-deep model and print the mean hit ratio @ 10.
hits, ndcgs, hits_five, ndcgs_five = evalaute_model_wide( model_wide, df_test, userid_to_true_item)
print(np.average(hits))

In [None]:
# NOTE(review): matplotlib is imported here rather than in the imports cell at the top.
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
# Training vs. validation loss per epoch.
plt.plot(train_loss)
plt.plot(val_loss)
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Train', 'Val'], loc='upper left')
# NOTE(review): the other loss plots in this notebook save to the same file name,
# so each run overwrites the previous figure.
plt.savefig("final_loss.pdf")
plt.show()

# 4.4 NCF

In [None]:
def evaluate_one_rating_ncf(model, user_id, all_articles, true_item):
    """
    Score one user's candidate articles with ``model`` and compute ranking metrics.

    params:
        model: object whose ``predict([users, items])`` returns one score per row
        user_id: user id, repeated once per candidate as the user input
        all_articles: candidate article ids for this user
        true_item: held-out article id to look for among the top-ranked candidates
    returns:
        (hr, ndcg, hr_five, ndcg_five): hit ratio and NDCG within the ten and the
        five highest-scored candidates
    """
    # Column vectors with one row per candidate. The user column is sized from the
    # candidates instead of being hard-coded to 100 rows.
    all_articles = np.array(all_articles).reshape(-1,1)
    num_candidates = len(all_articles)
    expanded_user_id = np.array([user_id]*num_candidates).reshape((num_candidates,1))

    predictions = model.predict([expanded_user_id, all_articles])
    # Flatten to one score per candidate (also valid for a single candidate).
    predicted_labels = np.asarray(predictions).reshape(-1)
    # Candidates ordered by descending score, best ten kept.
    top_ten_items = [all_articles[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    hr = getHitRatio(top_ten_items, true_item)
    ndcg = getNDCG(top_ten_items, true_item)
    hr_five = getHitRatio(top_ten_items[:5], true_item)
    ndcg_five = getNDCG(top_ten_items[:5], true_item)
    return hr, ndcg,hr_five,ndcg_five

def evalaute_model_ncf(model, df_test, userid_to_true_item):
    """
    Evaluate ``model`` on the first 400 distinct users of ``df_test``.

    Returns four lists with one entry per evaluated user:
    hits@10, ndcg@10, hits@5 and ndcg@5.
    """
    print("Evaluate model")
    hits, ndcgs, hits_five, ndcgs_five = [], [], [], []
    metric_lists = (hits, ndcgs, hits_five, ndcgs_five)
    for user_id in tqdm(df_test["user_id"].unique()[:400]):
        # All candidate rows of this user (true item plus sampled negatives).
        candidate_rows = df_test[df_test["user_id"] == user_id]
        true_item = userid_to_true_item[user_id]
        all_articles = candidate_rows["article_id"].to_numpy().astype(int)

        hit, gain, hit_five, gain_five = evaluate_one_rating_ncf(model, user_id, all_articles, true_item)
        for bucket, value in zip(metric_lists, (hit, gain, hit_five, gain_five)):
            bucket.append(value)
    return hits, ndcgs,hits_five,ndcgs_five

In [None]:
def get_model_ncf(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile a dot-product collaborative-filtering model.

    params:
        num_users: size of the user embedding table
        num_items: size of the item embedding table
        dims: width of both embeddings
        dense_layers: not used by this function
    returns:
        compiled Keras Model taking [user, item] and emitting one sigmoid score
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    
    user_emb = Embedding(output_dim=dims, 
                         input_dim=num_users, 
                         input_length=1, 
                         embeddings_initializer='he_normal', 
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_user_emb")(user_input)
    item_emb = Embedding(output_dim=dims, 
                         input_dim=num_items, 
                         input_length=1, 
                         embeddings_initializer='he_normal', 
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_item_emb")(item_input)
    
    user_vecs = Reshape([dims])(user_emb)
    item_vecs = Reshape([dims])(item_emb)
    
    # Dot product of the user and item vectors along axis 1, without normalisation.
    y = Dot(1, normalize=False)([user_vecs, item_vecs])
    
    # A single sigmoid unit rescales the dot product to a score in (0, 1).
    y = Dense(1, activation="sigmoid")(y)
    
    
    model = Model(inputs=[user_input, item_input], outputs=y)
    # NOTE(review): this model is trained with "mse" while the other models in this
    # notebook use "binary_crossentropy" - confirm this is intended.
    model.compile(
        optimizer=Adam(0.01),
        loss="mse",
        metrics=["accuracy"],
    )
    return model

model_ncf = get_model_ncf(num_users, num_items, dims)

In [None]:
# Positional columns of df_train after user_history was dropped:
# 0 = user_id, 7 = article_id, 9 = labels.
user_input = df_train.iloc[:, 0].values.reshape((-1,1))
item_input = df_train.iloc[:, 7].values.reshape((-1,1))
labels = df_train.iloc[:, 9].values.reshape((-1,1))
print(user_input.shape, item_input.shape, labels.shape )

In [None]:

all_user_ids = df_train["user_id"].unique()

#user_input = df_train.iloc[:, 0].values.reshape((-1,1))
#profile_input = df_train.iloc[:, 1:6].values
#item_input = df_train.iloc[:, 7].values.reshape((-1,1))
#labels = df_train.iloc[:, 8].values.reshape((-1,1))


# Per-epoch loss/accuracy collected from every fit() call below.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Ranking metrics per epoch and the values of the best epoch (by hits@10).
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=3
# One fit() call per epoch so the ranking evaluation can run after every epoch.
for epoch in range(epochs):
    hist = model_ncf.fit([user_input, item_input], labels, epochs=1, shuffle=True, verbose=1, validation_split=0.1)
    
    # Each history entry is a list with one value (epochs=1), so these become lists of one-element lists.
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])
    
    hits, ndcgs, ht_five, ndcg_five = evalaute_model_ncf( model_ncf, df_test, userid_to_true_item)
    hits_list.append(np.average(hits))
    ndcg_list.append(np.average(ndcgs))
    
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    # Remember all four metrics of the epoch with the highest hits@10.
    if (temp_hits > best_hits):
        best_hits = temp_hits
        best_hits_five = np.average(ht_five)
        best_ndcgs_five = np.average(ndcg_five)
        best_ndcgs = temp_ndcgs
    

In [None]:
# Best hits@10, ndcg@10, hits@5 and ndcg@5 of the NCF run, in that order.
print(best_hits)
print(best_ndcgs)
print(best_hits_five)
print(best_ndcgs_five)

In [1]:
# NOTE(review): this cell needs sns and plt from earlier cells; the error recorded below
# it shows it was last run in a kernel where the imports cell had not been executed.
sns.set_style("darkgrid")
# Training vs. validation loss per epoch.
plt.plot(train_loss)
plt.plot(val_loss)
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Train', 'Val'], loc='upper left')
# NOTE(review): the other loss plots in this notebook save to the same file name,
# so each run overwrites the previous figure.
plt.savefig("final_loss.pdf")
plt.show()

NameError: name 'sns' is not defined

# 4.5 NeuMF with features

In [None]:
def evaluate_one_rating_neumffeat(model, user_id, user_profiles, all_articles,categories, true_item):
    """Rank one user's candidate articles and score the ranking at cut-offs 10 and 5.

    Args:
        model: Model exposing ``predict``; it is called with the inputs
            ``[user ids, user_profiles, article ids, categories]``.
        user_id: Id of the user being evaluated; repeated once per candidate.
        user_profiles: Per-candidate user-profile features, passed to the
            model unchanged.
        all_articles: Candidate article ids for this user (any array-like).
        categories: Per-candidate category ids, passed to the model unchanged.
        true_item: The held-out article id handed to ``getHitRatio`` and
            ``getNDCG``.

    Returns:
        Tuple ``(hr, ndcg, hr_five, ndcg_five)``: the values returned by
        ``getHitRatio`` / ``getNDCG`` for the top-10 and top-5 candidates.
    """
    # One row per candidate so the array lines up with the model's item input.
    all_articles = np.array(all_articles).reshape(-1, 1)
    num_candidates = all_articles.shape[0]

    # Repeat the user id once per candidate. This used to be hard-coded to 100
    # rows, which broke for any user with a different number of candidates.
    expanded_user_id = np.full((num_candidates, 1), user_id)

    # verbose=0: this runs once per evaluated user, so a progress bar per call
    # would drown the surrounding tqdm output.
    predictions = model.predict(
        [expanded_user_id, user_profiles, all_articles, categories], verbose=0
    )
    # Flatten to 1-D. Unlike np.squeeze this stays 1-D for a single candidate,
    # so argsort below always has an axis to sort along.
    predicted_labels = np.reshape(predictions, -1)

    # Candidate indices ordered by descending score, truncated to the top 10.
    top_indices = np.argsort(predicted_labels)[::-1][0:10].tolist()
    top_ten_items = [all_articles[i] for i in top_indices]

    hr = getHitRatio(top_ten_items, true_item)
    ndcg = getNDCG(top_ten_items, true_item)
    hr_five = getHitRatio(top_ten_items[:5], true_item)
    ndcg_five = getNDCG(top_ten_items[:5], true_item)
    return hr, ndcg, hr_five, ndcg_five

def evalaute_model_neumffeat(model, df_test, userid_to_true_item, max_users=400):
    """Evaluate the feature-aware NeuMF model user by user on the test frame.

    Args:
        model: Model forwarded to ``evaluate_one_rating_neumffeat``.
        df_test: Test DataFrame with ``user_id`` and ``article_id`` columns.
            The user-profile features are read from column positions 1..6 and
            the category from column position 8.
        userid_to_true_item: Mapping from user id to that user's held-out
            article id.
        max_users: Evaluate only the first ``max_users`` distinct users, in
            order of first appearance in ``df_test``. Defaults to 400, the
            previously hard-coded cap. Pass ``None`` to evaluate every user.

    Returns:
        Tuple ``(hits, ndcgs, hits_five, ndcgs_five)`` of per-user lists, in
        the order the users were evaluated.
    """
    print("Evaluate model")
    hits, ndcgs = [], []
    hits_five, ndcgs_five = [], []

    # Slicing with None keeps every user, so max_users=None disables the cap.
    users = df_test["user_id"].unique()[:max_users]
    for user_id in tqdm(users):
        user_df = df_test[df_test["user_id"] == user_id]  # candidate rows for this user
        true_item = userid_to_true_item[user_id]  # held-out article for this user
        all_articles = user_df["article_id"].to_numpy().astype(int)  # candidate article ids
        user_profiles = user_df.iloc[:, 1:7].to_numpy().astype(int)  # user-profile columns
        categories = user_df.iloc[:, 8].to_numpy().astype(int)  # category column

        ht, ndcg, ht_five, ndcg_five = evaluate_one_rating_neumffeat(
            model, user_id, user_profiles, all_articles, categories, true_item
        )
        hits.append(ht)
        ndcgs.append(ndcg)
        hits_five.append(ht_five)
        ndcgs_five.append(ndcg_five)
    return hits, ndcgs,hits_five,ndcgs_five

In [None]:
def get_model_neumffeat(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """Build and compile the NeuMF model that also uses profile and category features.

    Two branches are merged into a single sigmoid prediction:

    * MF branch: element-wise product of a user embedding and an item
      embedding, both of size ``dims``.
    * MLP branch: user, item, item-category and user-profile embeddings (each
      of size ``dense_layers[0] / 2``) are flattened, concatenated and passed
      through one ReLU ``Dense`` layer followed by ``Dropout(0.2)`` per entry
      of ``dense_layers``.

    Fix: the dense tower used to be built on the category/profile embeddings
    and then ``mlp_vector`` was overwritten with the raw user/item embeddings.
    The tower and both feature inputs therefore never reached the prediction
    layer. All four MLP embeddings now feed the tower, and the tower output is
    what gets concatenated with the MF vector.

    The category and profile embeddings read the notebook-level
    ``num_categories`` for their vocabulary size.

    Args:
        num_users: Vocabulary size of the user embeddings.
        num_items: Vocabulary size of the item embeddings.
        dims: Size of the MF user/item embeddings.
        dense_layers: Units of each dense layer in the MLP tower. The first
            entry also sets the MLP embedding size (half of it).

    Returns:
        A compiled Keras ``Model`` with inputs ordered
        ``[user, user_profile, item, category_input]`` and one sigmoid output
        named ``"pred"``. It uses Adam(0.01), binary cross-entropy and the
        accuracy metric.
    """
    mlp_emb_dim = int(dense_layers[0] / 2)

    # ---- Inputs ----
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    profile_input = Input(shape=(6,), name="user_profile")
    category_input = Input(shape=(1,), name="category_input")

    # ---- Matrix-factorization branch ----
    mf_user_emb = Embedding(output_dim=dims,
                            input_dim=num_users,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims,
                            input_dim=num_items,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)

    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])

    # ---- MLP branch: id embeddings ----
    mlp_user_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_users,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    # Layer names are kept exactly as before so saved weights still match.
    mlp_item_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_items,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)

    # ---- MLP branch: side-feature embeddings ----
    item_category_emb = Embedding(input_dim=num_categories,
                                  output_dim=mlp_emb_dim,
                                  name="category_emd",
                                  embeddings_regularizer=regularizers.l2(0.001))(category_input)
    user_profile_emb = Embedding(input_dim=num_categories,
                                 output_dim=mlp_emb_dim,
                                 embeddings_regularizer=regularizers.l2(0.001),
                                 name="profile_emb")(profile_input)

    # Flatten every embedding to 2-D before concatenating: the profile input
    # has 6 positions while the others have 1.
    mlp_vector = Concatenate()([
        Flatten()(mlp_user_emb),
        Flatten()(mlp_item_emb),
        Flatten()(item_category_emb),
        Flatten()(user_profile_emb),
    ])

    # Dense tower over ids and side features together.
    for num_dense in dense_layers:
        mlp_vector = Dense(num_dense, activation="relu")(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)

    # ---- Fuse both branches into the prediction ----
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, profile_input, item_input,category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

# Instantiate the feature-aware NeuMF model with the default dense tower.
model_neumffeat = get_model_neumffeat(
    num_users=num_users,
    num_items=num_items,
    dims=dims,
)

In [None]:
###### Training inputs ########
# Single-column inputs are sliced by position and shaped as (n_samples, 1).
# NOTE(review): presumably column 0 is the user id, 7 the article id and 8 the
# category -- confirm against the df_train layout built earlier.
user_input, item_input, category_input = (
    df_train.iloc[:, column].values.reshape(-1, 1) for column in (0, 7, 8)
)
# Columns 1..6 form the 6-wide user-profile input.
profile_input = df_train.iloc[:, 1:7].values
# Labels stay 1-D here.
labels = df_train.iloc[:, 9].values.reshape(-1)
print(user_input.shape, profile_input.shape, item_input.shape, category_input.shape, labels.shape)

In [None]:
all_user_ids = df_train["user_id"].unique()

# Per-epoch history: each entry is the one-element list Keras reports for the
# single epoch run by that fit() call.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Mean HR@10 / NDCG@10 after every epoch, plus the metrics of the best epoch.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
# Reset the @5 trackers as well. They were previously left untouched, so if no
# epoch beat best_hits == 0 the report cell printed values left over from the
# NCF run (or raised NameError on a fresh kernel).
best_hits_five = 0
best_ndcgs_five = 0

epochs = 2

for epoch in range(epochs):
    # Train for exactly one epoch so the ranking metrics can be computed
    # between epochs.
    hist = model_neumffeat.fit(
        [user_input, profile_input, item_input, category_input],
        labels,
        validation_split=0.1,
        epochs=1,
        shuffle=True,
        verbose=1,
    )

    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model_neumffeat(model_neumffeat, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    # All "best" values are taken from the same epoch, selected by HR@10 only.
    if temp_hits > best_hits:
        best_hits = temp_hits
        best_ndcgs = temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)

In [None]:
# Best-epoch metrics, one per line: HR@10, NDCG@10, HR@5, NDCG@5.
print(best_hits, best_ndcgs, best_hits_five, best_ndcgs_five, sep="\n")

In [None]:
# Train vs. validation loss per epoch for the model trained in the cell above.
# NOTE(review): `plt` (matplotlib.pyplot) is not among the imports visible in
# the first notebook cell -- confirm it is imported before this cell runs.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
# NOTE(review): the NCF plot cell writes to this same file name, so this call
# overwrites that figure on disk.
fig.savefig("final_loss.pdf")
plt.show()