In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tqdm import tqdm
import re
import scipy
import tensorflow
from tensorflow.keras.layers import Input,Flatten, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU, LSTM
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization, Add, multiply
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
#import keras.backend as K
from sklearn.utils import shuffle
import seaborn as sns
import math
import collections
from collections import Counter
import matplotlib.pyplot as plt
import random

In [None]:
# Notebook-wide random seed; get_not_interacted() uses it to make its sampling repeatable.
SEED=42

In [None]:
# Read the news catalogue and the behaviour log (tab-separated, no header row) from PATH
# and attach column names.
PATH = "../data/mind_small/"

news = pd.read_csv(PATH + "news.tsv", sep="\t", header=None)
news.columns = ["news_id", "category", "sub_category", "title", "abstract", "url", "title_entities", "abstract_entities"]

behaviors = pd.read_csv(PATH + "behaviors.tsv", sep="\t", header=None)
behaviors.columns = ["idx", "user_id", "time", "history", "impressions"]
# Keep one row per distinct (user, history) pair, then drop rows missing either field.
behaviors = (
    behaviors
    .drop_duplicates(["user_id", "history"])
    .dropna(subset=["user_id", "history"])
)

In [None]:
# Map each user to the list of news ids in their click history. When a user appears in
# several rows, the last row's history wins (the key keeps its first-seen position).
sessions = {
    user_id: history.split(" ")
    for user_id, history in zip(behaviors["user_id"], behaviors["history"])
}

# Flatten to one (user, clicked news id) pair per click.
users = [user_id for user_id, history in sessions.items() for _ in history]
clicks = [news_id for history in sessions.values() for news_id in history]

tuples = list(zip(users, clicks))
interactions = pd.DataFrame(tuples, columns=["user", "news_id"])
# Work on the first 10000 interactions only.
interactions = interactions.head(10000)

In [None]:
# Attach the news metadata to every interaction (pandas' default inner join on news_id).
merged = pd.merge(interactions, news, on=["news_id"])
merged.head(1)

In [None]:
# Remove fully identical rows, printing the row count before and after.
print(merged.shape[0])
merged = merged.drop_duplicates(keep="first")
print(merged.shape[0])

In [None]:
# Discard interactions whose article has no abstract.
merged = merged.dropna(subset=["abstract"])

# 1. Preprocessing

In [None]:
# Keep only users with more than 5 interactions, i.e. remove users with 5 or fewer.
# (The earlier comment said "fewer than 5", which did not match the `> 5` filter.)
print("Len before removal: ",len(merged))
_interaction_counts = merged["user"].value_counts()
_keys = _interaction_counts[_interaction_counts > 5].keys()
# .copy() makes the filtered frame independent, so the column assignments below do not
# write into a slice of the unfiltered frame (pandas SettingWithCopyWarning).
merged = merged[merged["user"].isin(_keys)].copy()
print("Len after removal: ",len(merged))


# Encode the raw user / news identifiers as contiguous integer ids (0..n-1), which the
# embedding layers later use as indices.
user_enc = LabelEncoder()
article_enc = LabelEncoder()
merged["user_id"] = user_enc.fit_transform(merged["user"].values)
merged["article_id"] = article_enc.fit_transform(merged["news_id"].values)




In [None]:
import nltk
from nltk.corpus import stopwords
# Helper functions
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text, stops=None):
    """
    Remove stop words from `text`.

    The text is split on whitespace, tokens found in `stops` are dropped, and the
    remaining tokens are re-joined with single spaces. Matching is exact and
    case-sensitive, so lower-case the text first when using a lower-case list.

    params:
        text: string to filter
        stops: optional container of words to remove; defaults to NLTK's English
               stop-word list

    Returns the filtered string.
    """
    if stops is None:
        # Build the NLTK set once and keep it on the function object: re-reading the
        # corpus for every row made DataFrame.apply needlessly slow.
        stops = getattr(remove_stop_words, "_default_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            remove_stop_words._default_stops = stops
    return " ".join(w for w in text.split() if w not in stops)

def remove_html(text):
    """Delete every `<...>` span (shortest match) from `text`."""
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def text_to_list(text):
    """Split `text` on single spaces (consecutive spaces produce empty strings)."""
    return text.split(" ")

In [None]:
def clean_title(df):
    """
    Add a "title_cleaned" column to `df` (modified in place) and return `df`.

    The title is lower-cased, then stripped of stop words, then of punctuation.
    """
    cleaned = df.title
    for step in (make_lower_case, remove_stop_words, remove_punctuation):
        cleaned = cleaned.apply(func = step)
    df["title_cleaned"] = cleaned
    return df
def clean_abstract(df):
    """
    Add an "abstract_cleaned" column to `df` (modified in place) and return `df`.

    The abstract is lower-cased, then stripped of stop words, then of punctuation.
    """
    df["abstract_cleaned"] = (
        df.abstract
        .apply(func = make_lower_case)
        .apply(func = remove_stop_words)
        .apply(func = remove_punctuation)
    )
    return df
def hyphen_to_underline(category):
    """
    Return `category` with every hyphen replaced by an underscore, so a hyphenated
    sub-category stays a single token (the original note gives TF-IDF as the reason).
    """
    return "_".join(category.split("-"))
# Add the cleaned title / abstract columns (title first) and the underscore-normalised
# sub-category column.
merged = clean_abstract(clean_title(merged))
merged["subcategory_cleaned"] = merged["sub_category"].apply(hyphen_to_underline)

In [None]:
# Integer-encode sub-categories and categories, each with its own encoder.
# The category column was previously fitted with `subcategory_enc`, which overwrote that
# encoder's classes and left `category_enc` unfitted, so inverse_transform on either
# encoder could not work.
category_enc = LabelEncoder()
subcategory_enc = LabelEncoder()
merged["subcategory_int"] = subcategory_enc.fit_transform(merged["subcategory_cleaned"].values)
merged["category_int"] = category_enc.fit_transform(merged["category"].values)

In [None]:
# Build a fixed-length profile per user: the user's distinct sub-category ids ordered by
# how often the user clicked them (most frequent first, ties broken by larger id first),
# truncated or zero-padded to 6 entries.
users = merged["user_id"].unique()
userid_to_profile = collections.defaultdict(list)
# Group once instead of re-filtering `merged` for every user.
_subcats_by_user = merged.groupby("user_id")["subcategory_int"].apply(list)
for user_id in tqdm(users):
    counter = Counter(_subcats_by_user[user_id])
    final_subcategories = sorted(counter, key=lambda x: (counter[x], x), reverse=True)
    # NOTE: the padding value 0 is also a valid encoded sub-category id (LabelEncoder
    # ids start at 0), so padding cannot be told apart from sub-category 0.
    final_subcategories += [0] * (6 - len(final_subcategories))
    userid_to_profile[user_id] = final_subcategories[:6]

In [None]:
# One row per user, one column per profile slot.
profile_df = pd.DataFrame.from_dict(userid_to_profile, orient="index")
profile_df["user_id"] = profile_df.index
merged = merged.merge(profile_df, on="user_id")
# from_dict(orient="index") labels the slot columns with the integers 0..5, so the rename
# must use integer keys; the previous string keys ("0".."5") matched nothing.
merged = merged.rename(columns={slot: "p" + str(slot) for slot in range(6)})

# article id -> encoded category / sub-category lookups used when building model inputs.
article_id_to_category_int = merged[["article_id", "category_int"]].set_index("article_id").to_dict()
article_id_to_category_int = article_id_to_category_int["category_int"]

article_id_to_subcategory_int = merged[["article_id", "subcategory_int"]].set_index("article_id").to_dict()
article_id_to_subcategory_int = article_id_to_subcategory_int["subcategory_int"]

merged.head(1)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Token length that the title and abstract sequences are padded / truncated to below.
MAXLEN=10

In [None]:
# Fit a vocabulary on the cleaned titles and store each title as MAXLEN token ids
# (zero-padded at the end; pad_sequences truncates longer inputs from the front by default).
tokenizer = Tokenizer()
_title_texts = merged["title_cleaned"].values
tokenizer.fit_on_texts(_title_texts)
temp = pad_sequences(
    tokenizer.texts_to_sequences(_title_texts),
    maxlen=MAXLEN,
    padding="post",
)
merged["title_tokenized"] = temp.tolist()


In [None]:
# Title vocabulary size; +1 because Tokenizer word indices start at 1, leaving 0 for padding.
num_words_title = len(tokenizer.word_index) + 1

In [None]:
# Fit a separate vocabulary on the cleaned abstracts and store each abstract as MAXLEN
# token ids (zero-padded at the end).
tokenizer_abstract = Tokenizer()
tokenizer_abstract.fit_on_texts(merged["abstract_cleaned"].values)
# Use the abstract tokenizer here: the title `tokenizer` was used before, which dropped
# every word missing from the title vocabulary and produced ids that do not correspond
# to `tokenizer_abstract.word_index` (and therefore not to num_words_abstract).
temp = tokenizer_abstract.texts_to_sequences(merged["abstract_cleaned"].values)
temp = pad_sequences(temp, padding="post", maxlen=MAXLEN)
merged["abstract_tokenized"] = temp.tolist()

In [None]:
# Abstract vocabulary size; +1 because Tokenizer word indices start at 1, leaving 0 for padding.
num_words_abstract = len(tokenizer_abstract.word_index) + 1

In [None]:
# article id -> feature lookups (tokenized title, category, sub-category, tokenized abstract).
_article_features = merged.set_index("article_id")
articleId_to_title = _article_features[["title_tokenized"]].to_dict()["title_tokenized"]
article_to_category = _article_features[["category_int"]].to_dict()["category_int"]
article_to_subcategory = _article_features[["subcategory_int"]].to_dict()["subcategory_int"]
article_to_abstract = _article_features[["abstract_tokenized"]].to_dict()["abstract_tokenized"]

In [None]:
def get_items_interacted(user_id, df):
    """Return the set of article ids found in the rows of `df` belonging to `user_id`."""
    user_rows = df["user_id"] == user_id
    interacted_items = df[user_rows]["article_id"]
    if type(interacted_items) == pd.Series:
        return set(interacted_items)
    return set([interacted_items])
# Number of non-interacted articles drawn per user by get_not_interacted().
SAMPLE_SIZE=99
def get_not_interacted(user_id, df):
    """
    Return SAMPLE_SIZE distinct article ids from `df` that `user_id` never interacted with.

    The draw is seeded with SEED, so repeated calls with the same inputs return the
    same list.

    params:
        user_id: encoded user id
        df: frame with "user_id" and "article_id" columns

    Returns a list of article ids.
    Raises ValueError when fewer than SAMPLE_SIZE candidate articles exist.
    """
    interacted_items = get_items_interacted(user_id, df)
    all_items = set(df["article_id"])
    # random.sample() no longer accepts a set (TypeError since Python 3.11); sorting also
    # makes the draw independent of set iteration order.
    candidates = sorted(all_items - interacted_items)
    # A private generator keeps the draw repeatable without re-seeding the global
    # `random` module on every call, which reset the random state for all other code.
    rng = random.Random(SEED)
    return rng.sample(candidates, SAMPLE_SIZE)

# 2. Train-test split

In [None]:
def train_test_split(df, user_id, article_id, have_timestamp, timestamp):
    """
    Leave-one-out split: the last row of every user goes to the test set, all other
    rows go to the training set.

    NOTE: this definition shadows sklearn.model_selection.train_test_split, which is
    imported at the top of the notebook.

    params:
        df: interaction frame
        user_id: name of the user column
        article_id: name of the article column (not used by the split itself)
        have_timestamp: if True, "last" means the latest `timestamp` per user;
                        otherwise it means the user's last row in `df` order
        timestamp: name of the timestamp column (ignored when have_timestamp is False)

    Returns (df_train, df_test).
    """
    sort_column = timestamp if have_timestamp else user_id
    # A stable sort keeps rows with equal keys in their original order, so the row
    # picked by tail(1) is well defined; the default quicksort gives no such guarantee.
    df_test = df.sort_values(sort_column, kind="mergesort").groupby(user_id).tail(1)
    df_train = df.drop(index=df_test.index)

    # Dropping by label removes every row sharing a label, so this fails if `df` has
    # duplicate index labels.
    assert df_test.shape[0] + df_train.shape[0] == df.shape[0]

    return df_train, df_test
# Split without a timestamp column: have_timestamp=False, so the trailing 0 is an unused placeholder.
df_train_true, df_test_true = train_test_split(merged, "user_id", "article_id", False, 0)

In [None]:
def get_userid_to_article_history(df, history_len=10):
    """
    Map every user id in `df` to a fixed-length array of clicked article ids.

    The user's article ids are taken in `df` row order, truncated to the first
    `history_len` entries and right-padded with 0 when shorter.
    NOTE: 0 is also a valid encoded article id, so padding cannot be told apart from
    article 0.

    params:
        df: frame with "user_id" and "article_id" columns
        history_len: length of every returned history (default 10)

    Returns a dict {user_id: numpy array of length history_len}.
    """
    userid_to_article_history = {}
    # Group once instead of re-filtering `df` for every user; sort=False keeps users in
    # order of first appearance.
    clicks_by_user = df.groupby("user_id", sort=False)["article_id"]
    for user_id, clicks in tqdm(clicks_by_user):
        click_history = clicks.values[:history_len]
        missing = history_len - len(click_history)
        if missing > 0:
            click_history = np.pad(click_history, (0, missing), constant_values=0)
        userid_to_article_history[user_id] = click_history
    return userid_to_article_history
userid_to_article_history = get_userid_to_article_history(df_train_true)

In [None]:
all_article_ids = merged["article_id"].unique()

def negative_sampling(train_df, all_article_ids, user_id, article_id, num_negatives=4):
    """
    Build the labelled training frame by negative sampling.

    Every distinct (user, article) pair in `train_df` yields one positive row (label 1)
    plus `num_negatives` rows (label 0) whose article is drawn uniformly from
    `all_article_ids` and re-drawn while the pair occurs in `train_df`.

    Relies on the notebook-level lookups userid_to_article_history, userid_to_profile,
    article_id_to_category_int, article_id_to_subcategory_int, articleId_to_title and
    article_to_abstract.

    params:
        train_df: interactions to sample from
        all_article_ids: array of candidate article ids for the negatives
        user_id: name of the user column in train_df
        article_id: name of the article column in train_df
        num_negatives: negatives generated per positive (default 4)

    Returns a shuffled DataFrame with the columns user_id, user_history, p0..p5,
    article_id, article_category, article_sub_category, titles, abstract, labels.

    Notes:
        * Negatives are only checked against `train_df`, so an article held out of
          `train_df` for a user can still be drawn as a negative for that user.
        * np.random.choice draws from NumPy's global RNG, which is not seeded here, so
          the sampled negatives differ between runs.
    """
    column_names = ["user_id","user_history","p0", "p1", "p2", "p3", "p4", "p5", "article_id","article_category","article_sub_category","titles","abstract", "labels"]
    columns = {name: [] for name in column_names}
    user_item_set = set(zip(train_df[user_id],
                            train_df[article_id]))

    def _append_row(u, profile, item, label):
        # Append one complete row at a time so all column lists stay aligned.
        # Previously the positive's article id and label were appended after its
        # negatives while its features were appended before them, which shifted
        # article_id / labels against category, sub-category, title and abstract.
        values = (
            u,
            userid_to_article_history[u],
            *profile[:6],
            item,
            article_id_to_category_int[item],
            article_id_to_subcategory_int[item],
            articleId_to_title[item],
            article_to_abstract[item],
            label,
        )
        for name, value in zip(column_names, values):
            columns[name].append(value)

    for (u, i) in tqdm(user_item_set):
        profile = np.array(userid_to_profile[u])
        _append_row(u, profile, i, 1)

        for _ in range(num_negatives):
            negative_item = np.random.choice(all_article_ids)
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(all_article_ids)
            _append_row(u, profile, negative_item, 0)

    # Shuffle all columns with the same fixed permutation.
    shuffled = shuffle(*(columns[name] for name in column_names), random_state=0)

    return pd.DataFrame(list(zip(*shuffled)), columns=column_names)



df_train = negative_sampling(df_train_true, all_article_ids, "user_id", "article_id")

In [None]:
def fix_dftrain(df, column, max_len, padding):
    """
    Expand the sequence-valued `column` of `df` into `max_len` scalar columns named
    "<column>_0" .. "<column>_<max_len-1>", using `padding` where a sequence is too short.

    `df` is modified in place and also returned; `column` itself is kept.
    """
    for position in tqdm(range(max_len)):
        def pick(sequence, position=position):
            return sequence[position] if position < len(sequence) else padding
        df[column + "_" + str(position)] = df[column].apply(pick)
    return df

# Spread the 10-item click history over user_history_0..9, then drop the list column.
df_train = fix_dftrain(df_train, "user_history", 10, 0)
df_train = df_train.drop(columns=["user_history"])
df_train.head()

In [None]:
# For each user and their held-out test item: sample 99 articles the user has never
# interacted with and append the true item, giving 100 candidates to rank.

def negative_sample_testset(ordiginal_df, df_test, all_article_ids, user_id, article_id, num_negatives=99):
    """
    Build the ranking test set.

    For every (user, item) pair in `df_test`, `num_negatives` articles the user never
    interacted with in `ordiginal_df` are sampled, followed by the true item. One
    feature row is produced per candidate.

    Relies on the notebook-level lookups userid_to_profile, userid_to_article_history,
    article_id_to_category_int, article_id_to_subcategory_int, articleId_to_title and
    article_to_abstract.

    params:
        ordiginal_df: full interaction frame, used to find what each user interacted with
        df_test: held-out interactions
        all_article_ids: candidate article ids
        user_id: name of the user column
        article_id: name of the article column
        num_negatives: negatives sampled per test pair (default 99)

    Returns (X_test, df_test, userid_to_true_item):
        X_test: object array with one row per candidate
        df_test: the same rows as a DataFrame with named columns
        userid_to_true_item: {user id: held-out article id}

    Note: np.random.choice draws from NumPy's global RNG, which is not seeded here.
    """
    columns = ["user_id",
               "click_history",
               "p0",
               "p1",
               "p2",
               "p3",
               "p4",
               "p5",
               "article_id",
               "category",
               "sub_category",
               "title",
               "abstract"]
    test_user_item_set = set(zip(df_test[user_id], df_test[article_id]))
    user_interacted_items = ordiginal_df.groupby(user_id)[article_id].apply(list).to_dict()
    all_items = set(all_article_ids)  # built once instead of once per user

    rows = []
    userid_to_true_item = {} # keep track of the real items
    for (u,i) in tqdm(test_user_item_set):
        candidates = list(all_items - set(user_interacted_items[u]))
        # Draw distinct negatives whenever enough candidates exist; the previous default
        # (sampling with replacement) could repeat the same article among the negatives.
        # Replacement is only kept as a fallback when there are too few candidates.
        replace = len(candidates) < num_negatives
        negatives = list(np.random.choice(candidates, num_negatives, replace=replace))
        profile = userid_to_profile[u]
        for item in negatives + [i]:
            rows.append([u,
                         userid_to_article_history[u],
                         profile[0],
                         profile[1],
                         profile[2],
                         profile[3],
                         profile[4],
                         profile[5],
                         item,
                         article_id_to_category_int[item],
                         article_id_to_subcategory_int[item],
                         articleId_to_title[item],
                         article_to_abstract[item]
                        ])
        userid_to_true_item[u] = i

    # Rows mix scalars with length-10 sequences; np.array() on such ragged data raises
    # on recent NumPy versions, so fill an object array cell by cell instead.
    X_test = np.empty((len(rows), len(columns)), dtype=object)
    for r, row in enumerate(rows):
        for c, value in enumerate(row):
            X_test[r, c] = value
    df_test = pd.DataFrame(X_test, columns=columns)
    return X_test, df_test, userid_to_true_item
X_test, df_test, userid_to_true_item = negative_sample_testset(merged, df_test_true, merged["article_id"].unique(), "user_id", "article_id")
    
    

In [None]:
def fix_dftest(df, column, max_len, padding):
    """
    Add columns "<column>_0" .. "<column>_<max_len-1>" to `df`, each holding one
    position of the sequences stored in `column` (`padding` where a sequence is shorter).

    `df` is modified in place and also returned; `column` itself is kept.
    """
    def element_at(position):
        # Picker for a single position of a sequence.
        return lambda seq: seq[position] if position < len(seq) else padding

    for i in tqdm(range(max_len)):
        df[column + "_" + str(i)] = df[column].apply(element_at(i))
    return df

# Spread the 10-item click history over click_history_0..9, then drop the list column.
df_test = fix_dftest(df_test, "click_history", 10, 0)
df_test = df_test.drop(columns=["click_history"])

In [None]:
def getHitRatio(ranklist, gtItem):
    """Return 1 if `gtItem` equals any entry of `ranklist`, otherwise 0."""
    return int(any(item == gtItem for item in ranklist))

def getNDCG(ranklist, gtItem):
    """
    Return log(2) / log(position + 2) for the first 0-based position at which `gtItem`
    appears in `ranklist`, or 0 when it does not appear.
    """
    for position, item in enumerate(ranklist):
        if item == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0

# 4. NeuMF 

# 4.1 NeuMF no features

In [None]:
# Embedding table sizes (distinct encoded users / articles) and the MF embedding width.
num_users = len(merged["user_id"].unique())
num_items = len(merged["article_id"].unique())
dims = 20
def get_model_neumf(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile a NeuMF model that uses only user and item ids.

    Two branches share the same two inputs:
      * MF branch: element-wise product of `dims`-wide user and item embeddings.
      * MLP branch: concatenated user and item embeddings (each dense_layers[0] / 2
        wide) passed through one relu Dense layer per entry of `dense_layers`.
    Both branch outputs are concatenated and fed to a single sigmoid unit.

    params:
        num_users: input_dim of the user embeddings
        num_items: input_dim of the item embeddings
        dims: width of the MF embeddings
        dense_layers: sizes of the MLP Dense layers, in order

    Returns the model compiled with Adam(0.01), binary cross-entropy and accuracy.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    
    # Embeddings for the matrix-factorization branch.
    mf_user_emb = Embedding(output_dim=dims, 
                            input_dim=num_users, 
                            input_length=1, 
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims, 
                            input_dim=num_items,
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            input_length=1, name="mf_item_emb")(item_input)
    
    # NOTE: num_layers is computed but not used anywhere in this function.
    num_layers = len(dense_layers)
    # Embeddings for the MLP branch; each is half the width of the first Dense layer.
    mlp_user_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_users, 
                             input_length=1,
                             embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    # NOTE: despite its layer name "mlp_user_item", this is the item embedding.
    mlp_item_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_items, 
                             input_length=1, 
                             embeddings_initializer='he_normal', 
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)
    
    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])
    
    #MLP
    mlp_vec = Concatenate()([mlp_user_emb, mlp_item_emb])
    mlp_vector = Flatten()(mlp_vec)
    
    for num_nodes in dense_layers:
        l = Dense(num_nodes, activation="relu")
        mlp_vector = l(mlp_vector)
    
    # Fuse both branches and predict the click probability.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)
    
    
    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf = get_model_neumf(num_users, num_items, dims)

In [None]:
# Training arrays for the feature-free NeuMF model, each reshaped to (n, 1).
# Columns are selected by name: after user_history is expanded and dropped, position 11
# of df_train is the "abstract" column, so the old positional lookup did not return the
# labels.
user_input = df_train["user_id"].values.reshape((-1,1))
item_input = df_train["article_id"].values.reshape((-1,1))
labels = df_train["labels"].values.reshape((-1,1))
print(user_input.shape, item_input.shape, labels.shape )

In [None]:
# Leave-one-out evaluation of the feature-free NeuMF model on up to 100 test users.
# Each user's held-out article (from userid_to_true_item) is ranked against 99 articles
# the user never interacted with.
# Fixes: this cell previously scored `model_neumf_one_feat`, used an undefined `items`,
# and took its (user, item) pairs from all rows of df_test, which are mostly sampled
# negatives rather than true held-out items.
# NOTE(review): no fit() call for model_neumf is visible above this cell - train it first.
test_users = np.array(list(userid_to_true_item.keys()))
test_items = np.array(list(userid_to_true_item.values()))
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_set = zip(test_users[:100], test_items[:100])
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    items = np.array([i]+not_interacted_items)
    users = np.array([u]*len(items))

    predictions = model_neumf.predict([users, items])
    predicted_labels = np.squeeze(predictions)
    # Candidates with the ten highest scores, best first.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# NOTE(review): `evalaute_model_neumf` is not defined anywhere in the visible part of this
# notebook (the name also looks like a typo of "evaluate"). Unless it is defined elsewhere,
# this cell raises NameError on a fresh kernel - confirm before running.
hits, ndcgs, hits_five, ndcgs_five = evalaute_model_neumf( model_neumf, df_test, userid_to_true_item)

# NeuMF 1 feature

In [None]:
# Embedding table sizes (distinct encoded users / articles / categories / sub-categories)
# and the MF embedding width.
num_users = len(merged["user_id"].unique())
num_items = len(merged["article_id"].unique())
num_categories = len(merged["category_int"].unique()) 
num_sub_categories = len(merged["subcategory_int"].unique())

dims = 20
def get_model_neumfonefeat(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile a NeuMF model with one side feature (the article category).

    Two branches:
      * MF branch: element-wise product of `dims`-wide user and item embeddings.
      * MLP branch: category, user and item embeddings (each dense_layers[0] / 2 wide)
        are concatenated and passed through one relu Dense layer plus Dropout(0.2) per
        entry of `dense_layers`.
    Both branch outputs are concatenated and fed to a single sigmoid unit.

    The category embedding size is read from the notebook-level `num_categories`.

    params:
        num_users: input_dim of the user embeddings
        num_items: input_dim of the item embeddings
        dims: width of the MF embeddings
        dense_layers: sizes of the MLP Dense layers, in order

    Returns the model (inputs: user, item, category) compiled with Adam(0.01),
    binary cross-entropy and accuracy.
    """
    mlp_emb_dim = int(dense_layers[0] / 2)

    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    category_input = Input(shape=(1,), name="category_input")

    # Embeddings for the matrix-factorization branch.
    mf_user_emb = Embedding(output_dim=dims,
                            input_dim=num_users,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims,
                            input_dim=num_items,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)

    # Embeddings for the MLP branch.
    mlp_user_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_users,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    mlp_item_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_items,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)
    item_category_emb = Embedding(input_dim=num_categories, output_dim=mlp_emb_dim, name="category_emd", embeddings_regularizer=regularizers.l2(0.001))(category_input)

    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])

    # MLP
    item_category_flatten = Flatten()(item_category_emb)
    user_flatten = Flatten()(mlp_user_emb)
    item_flatten = Flatten()(mlp_item_emb)

    mlp_vector = Concatenate()([item_category_flatten, user_flatten, item_flatten])
    for num_dense in dense_layers:
        mlp_vector = Dense(num_dense, activation="relu")(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)

    # Fuse the MF product with the output of the dense tower. The tower used to be
    # overwritten by the raw user/item embeddings just before this point, so neither
    # the Dense layers nor the category input had any effect on the prediction.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, item_input,category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf_one_feat = get_model_neumfonefeat(num_users, num_items, dims)

In [None]:
###### Training ########
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
labels = df_train.labels.values
epochs = 3
for epoch in range(epochs):
    hist = model_neumf_one_feat.fit([user_input,articles,category], labels, validation_split=0.1, epochs=1, shuffle=True)

In [None]:
# Leave-one-out evaluation of the one-feature NeuMF model on up to 100 test users.
# Each user's held-out article (from userid_to_true_item) is ranked against 99 articles
# the user never interacted with. The (user, item) pairs used to be drawn from all rows
# of df_test, which are mostly sampled negatives, so the "ground truth" was usually not
# the held-out item.
test_users = np.array(list(userid_to_true_item.keys()))
test_items = np.array(list(userid_to_true_item.values()))
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_set = zip(test_users[:100], test_items[:100])
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    items = np.array([i]+not_interacted_items)
    users = np.array([u]*len(items))
    # Every candidate gets its own category; tiling the true item's category over all
    # candidates gave the 99 negatives the wrong feature value.
    categories = np.array([article_to_category[item] for item in items]).reshape(-1,1)

    predictions = model_neumf_one_feat.predict([users, items,categories])
    predicted_labels = np.squeeze(predictions)
    # Candidates with the ten highest scores, best first.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean HR@10, HR@5, NDCG@10 and NDCG@5 over the evaluated users, one value per line.
for metric_values in (h_ten, h_five, n_ten, n_five):
    print(np.average(metric_values))

# 4.2 NeuMF 2 features

In [None]:
# Embedding table sizes (distinct encoded users / articles / categories / sub-categories)
# and the MF embedding width.
num_users = len(merged["user_id"].unique())
num_items = len(merged["article_id"].unique())
num_categories = len(merged["category_int"].unique()) 
num_sub_categories = len(merged["subcategory_int"].unique())

dims = 20
def get_model_neumftwofeat(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """
    Build and compile a NeuMF model with two side features (article category and
    sub-category).

    Two branches:
      * MF branch: element-wise product of `dims`-wide user and item embeddings.
      * MLP branch: category, user, item and sub-category embeddings (each
        dense_layers[0] / 2 wide) are concatenated and passed through one relu Dense
        layer plus Dropout(0.2) per entry of `dense_layers`.
    Both branch outputs are concatenated and fed to a single sigmoid unit.

    The feature embedding sizes are read from the notebook-level `num_categories` and
    `num_sub_categories`.

    params:
        num_users: input_dim of the user embeddings
        num_items: input_dim of the item embeddings
        dims: width of the MF embeddings
        dense_layers: sizes of the MLP Dense layers, in order

    Returns the model (inputs: user, item, category, sub-category) compiled with
    Adam(0.01), binary cross-entropy and accuracy.
    """
    mlp_emb_dim = int(dense_layers[0] / 2)

    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    category_input = Input(shape=(1,), name="category_input")
    sub_category_input = Input(shape=(1,), name="subcategory_input")

    # Embeddings for the matrix-factorization branch.
    mf_user_emb = Embedding(output_dim=dims,
                            input_dim=num_users,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims,
                            input_dim=num_items,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)

    # Embeddings for the MLP branch.
    mlp_user_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_users,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    mlp_item_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_items,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)
    item_category_emb = Embedding(input_dim=num_categories, output_dim=mlp_emb_dim, name="category_emd", embeddings_regularizer=regularizers.l2(0.001))(category_input)
    item_subcategory_emb = Embedding(input_dim=num_sub_categories, output_dim=mlp_emb_dim,embeddings_regularizer=regularizers.l2(0.001), name="subcat_emb")(sub_category_input)

    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])

    # MLP
    item_category_flatten = Flatten()(item_category_emb)
    item_subcategory_flatten = Flatten()(item_subcategory_emb)
    user_flatten = Flatten()(mlp_user_emb)
    item_flatten = Flatten()(mlp_item_emb)

    mlp_vector = Concatenate()([item_category_flatten,user_flatten, item_flatten, item_subcategory_flatten])
    for num_dense in dense_layers:
        mlp_vector = Dense(num_dense, activation="relu")(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)

    # Fuse the MF product with the output of the dense tower. The tower used to be
    # overwritten by the raw user/item embeddings just before this point, so neither
    # the Dense layers nor the two feature inputs had any effect on the prediction.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, item_input,category_input, sub_category_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf_two_feat = get_model_neumftwofeat(num_users, num_items, dims)

In [None]:
###### Training ########
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
sub_cat = df_train.article_sub_category.values
labels = df_train.labels.values
epochs = 3
for epoch in range(epochs):
    hist = model_neumf_two_feat.fit([user_input,articles,category,sub_cat], labels, validation_split=0.1, epochs=1, shuffle=True)

In [None]:
# Leave-one-out evaluation of the two-feature NeuMF model on up to 100 test users.
# Each user's held-out article (from userid_to_true_item) is ranked against 99 articles
# the user never interacted with. The (user, item) pairs used to be drawn from all rows
# of df_test, which are mostly sampled negatives, so the "ground truth" was usually not
# the held-out item.
test_users = np.array(list(userid_to_true_item.keys()))
test_items = np.array(list(userid_to_true_item.values()))
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
test_set = zip(test_users[:100], test_items[:100])
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    items = np.array([i]+not_interacted_items)
    users = np.array([u]*len(items))
    # Every candidate gets its own category / sub-category; tiling the true item's
    # values over all candidates gave the 99 negatives the wrong feature values.
    categories = np.array([article_to_category[item] for item in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[item] for item in items]).reshape(-1,1)
    predictions = model_neumf_two_feat.predict([users, items,categories,subcategories])
    predicted_labels = np.squeeze(predictions)
    # Candidates with the ten highest scores, best first.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean HR@10, HR@5, NDCG@10 and NDCG@5 over the evaluated users, one value per line.
for metric_values in (h_ten, h_five, n_ten, n_five):
    print(np.average(metric_values))

# 4.3 NeuMF 3 features

In [None]:
# Same counts as in the earlier model cells, recomputed so this cell does not depend on them:
# distinct encoded users, articles, categories and sub-categories in `merged`.
num_users = len(merged["user_id"].unique())
num_items = len(merged["article_id"].unique())
num_categories = len(merged["category_int"].unique()) 
num_sub_categories = len(merged["subcategory_int"].unique())

# Width of the matrix-factorization embeddings.
dims = 20
def get_model_neumfthreefeat(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    
    mf_user_emb = Embedding(output_dim=dims, 
                            input_dim=num_users, 
                            input_length=1, 
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims, 
                            input_dim=num_items, 
                            input_length=1, 
                            embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)
    
    num_layers = len(dense_layers)
    mlp_user_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_users, 
                             input_length=1, 
                             embeddings_initializer='he_normal', 
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    mlp_item_emb = Embedding(output_dim=int(dense_layers[0] / 2), 
                             input_dim=num_items, 
                             input_length=1, 
                             embeddings_initializer='he_normal', 
                            embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)
    
    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])
    
    #MLP
    category_input = Input(shape=(1,), name="category_input")
    sub_category_input = Input(shape=(1,), name="subcategory_input")
    title_input = Input(shape=(10,), name="title_input")
    
    item_category_emb = Embedding(input_dim=num_categories, output_dim=int(dense_layers[0] / 2), name="category_emd", embeddings_regularizer=regularizers.l2(0.001))(category_input)
    item_subcategory_emb = Embedding(input_dim=num_sub_categories, output_dim=int(dense_layers[0] / 2),embeddings_regularizer=regularizers.l2(0.001), name="subcat_emb")(sub_category_input)
    title_emb = Embedding(input_dim=num_words_title, output_dim=int(dense_layers[0] / 2),embeddings_regularizer=regularizers.l2(0.001), name="subcat_emb")(title_input)

    
    
    item_category_flatten = Flatten()(item_category_emb)
    item_subcategory_flatten = Flatten()(item_subcategory_emb)
    title_flatten = Flatten()(title_emb)

    user_flatten = Flatten()(mlp_user_emb)
    item_flatten = Flatten()(mlp_item_emb)
    
    
    wide_features = Concatenate()([item_category_flatten,user_flatten, item_flatten, item_subcategory_flatten, title_flatten])
    mlp_vector = Flatten()(wide_features)
    for num_dense in dense_layers:
        l = Dense(num_dense, activation="relu")
        mlp_vector = l(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)
    

    
    mlp_vec = Concatenate()([mlp_user_emb, mlp_item_emb])
    mlp_vector = Flatten()(mlp_vec)
    
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)
    
    
    model = Model(inputs=[user_input, item_input,category_input, sub_category_input, title_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf_three_feat = get_model_neumfthreefeat(num_users, num_items, dims)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
sub_cat = df_train.article_sub_category.values
# One row of title tokens per sample; the model's title input expects 10 tokens per row.
title = np.array([np.array(t) for t in df_train.titles.values])
labels = df_train.labels.values
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_neumf_three_feat.fit(
    [user_input, articles, category, sub_cat, title],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category/subcategory/title, so the side
    # inputs describe the item actually being scored.
    # Assumes the lookup dicts cover every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in items]).reshape(-1,1)
    titles = np.array([articleId_to_title[a] for a in items]).reshape(-1,10)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_neumf_three_feat.predict([users, items,categories,subcategories,titles], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 4.4 NeuMF all features

In [None]:
num_users = len(merged["user_id"].unique())
num_items = len(merged["article_id"].unique())
num_categories = len(merged["category_int"].unique())
num_sub_categories = len(merged["subcategory_int"].unique())

dims = 20
def get_model_neumffourfeat(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a NeuMF model whose MLP tower sees category, subcategory, title and abstract.

    The model has two branches that are concatenated before a sigmoid output:
      * MF branch: element-wise product of a user and an item embedding of size `dims`.
      * MLP branch: user, item, category, subcategory, title-token and abstract-token
        embeddings are flattened, concatenated and passed through a
        Dense(relu) + Dropout(0.2) stack.

    Vocabulary sizes for the side features (`num_categories`, `num_sub_categories`,
    `num_words_title`, `num_words_abstract`) are read from the notebook globals.

    Args:
        num_users: size of the user embedding tables.
        num_items: size of the item embedding tables.
        dims: embedding width of the MF branch.
        dense_layers: widths of the MLP layers; every MLP-side embedding has
            width ``dense_layers[0] / 2``.

    Returns:
        A compiled Keras Model taking ``[user (1,), item (1,), category (1,),
        subcategory (1,), title (10,), abstract (10,)]``.
    """
    mlp_emb_dim = int(dense_layers[0] / 2)

    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")

    mf_user_emb = Embedding(output_dim=dims,
                            input_dim=num_users,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims,
                            input_dim=num_items,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)

    mlp_user_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_users,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    mlp_item_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_items,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)

    # Matrix factorization
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)

    mf_vec = multiply([mf_user_vecs, mf_item_vecs])

    # MLP
    category_input = Input(shape=(1,), name="category_input")
    sub_category_input = Input(shape=(1,), name="subcategory_input")
    title_input = Input(shape=(10,), name="title_input")
    abstract_input = Input(shape=(10,), name="abstract")

    item_category_emb = Embedding(input_dim=num_categories, output_dim=mlp_emb_dim, name="category_emd", embeddings_regularizer=regularizers.l2(0.001))(category_input)
    item_subcategory_emb = Embedding(input_dim=num_sub_categories, output_dim=mlp_emb_dim, embeddings_regularizer=regularizers.l2(0.001), name="subcat_emb")(sub_category_input)
    # Layer names must be unique within a Model, so each text table gets its own name.
    title_emb = Embedding(input_dim=num_words_title, output_dim=mlp_emb_dim, embeddings_regularizer=regularizers.l2(0.001), name="title_emb")(title_input)
    abstract_emb = Embedding(input_dim=num_words_abstract, output_dim=mlp_emb_dim, embeddings_regularizer=regularizers.l2(0.001), name="abstract_emb")(abstract_input)

    item_category_flatten = Flatten()(item_category_emb)
    item_subcategory_flatten = Flatten()(item_subcategory_emb)
    title_flatten = Flatten()(title_emb)
    abs_flatten = Flatten()(abstract_emb)

    user_flatten = Flatten()(mlp_user_emb)
    item_flatten = Flatten()(mlp_item_emb)

    wide_features = Concatenate()([item_category_flatten,
                                   user_flatten,
                                   item_flatten,
                                   item_subcategory_flatten,
                                   title_flatten,
                                   abs_flatten])
    mlp_vector = wide_features
    for num_dense in dense_layers:
        mlp_vector = Dense(num_dense, activation="relu")(mlp_vector)
        mlp_vector = Dropout(0.2)(mlp_vector)

    # The MLP tower output (not the raw user/item embeddings) is what joins the MF
    # vector, so all four side-feature inputs influence the prediction.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, item_input, category_input, sub_category_input, title_input, abstract_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_neumf_four_feat = get_model_neumffourfeat(num_users, num_items, dims)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
sub_cat = df_train.article_sub_category.values
# One row of tokens per sample; the model's title/abstract inputs expect 10 tokens per row.
title = np.array([np.array(t) for t in df_train.titles.values])
abstract = np.array([np.array(a) for a in df_train.abstract.values])

labels = df_train.labels.values
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_neumf_four_feat.fit(
    [user_input, articles, category, sub_cat, title, abstract],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category/subcategory/title/abstract, so the
    # side inputs describe the item actually being scored.
    # Assumes the lookup dicts cover every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in items]).reshape(-1,1)
    titles = np.array([articleId_to_title[a] for a in items]).reshape(-1,10)
    abstracts = np.array([article_to_abstract[a] for a in items]).reshape(-1,10)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_neumf_four_feat.predict([users, items,categories,subcategories,titles,abstracts], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 5.1 ENSUS 

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=(128, 64, 32, 8)):
    """Build and compile the ENSUS model on click history, candidate item and item category.

    Two branches are concatenated, batch-normalised and fed to a sigmoid output:
      * wide: embeddings of the 10 click-history slots and of the candidate item are
        stacked along the sequence axis, flattened and passed through a
        Dense(relu) + Dropout(0.7) stack ending in a 2-unit linear layer.
      * deep: the category embedding goes through LSTM(40), Dropout(0.5) and batch norm.

    Args:
        num_users: not used by this model; kept so all variants share one signature.
        num_items: number of distinct articles; the item tables hold ``num_items + 1``
            rows (presumably one extra index for padding -- confirm against the data prep).
        dims: embedding width.
        num_categories: size of the category embedding table.
        num_sub_categories: not used by this variant; kept for signature compatibility.
        dense_layers: widths of the wide-branch Dense layers.

    Returns:
        A compiled Keras Model taking ``[history (10,), item (1,), category (1,)]``.
    """
    # User features
    user_history = Input(shape=(10,), name="user")
    # Item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")

    # User emb
    click_history_emb = Embedding(output_dim=dims,
                                  input_dim=num_items+1,
                                  input_length=10,
                                  embeddings_initializer='he_normal',
                                  embeddings_regularizer=regularizers.l2(0.001),
                                  name="mf_user_emb")(user_history)

    # Item emb
    item_emb = Embedding(output_dim=dims,
                         input_dim=num_items+1,
                         input_length=1,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_item_emb")(item_input)
    category_emb = Embedding(output_dim=dims,
                             input_dim=num_categories,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="cat_emb")(item_category)

    ### Wide
    # axis=1 stacks the 10 history embeddings and the item embedding into 11 steps.
    wide = Concatenate(axis=1)([click_history_emb, item_emb])
    wide = Flatten()(wide)
    for n in dense_layers:
        wide = Dense(n, activation="relu")(wide)
        wide = Dropout(0.7)(wide)
    y_wide = Dense(2)(wide)

    ### Deep
    x_deep = LSTM(40)(category_emb)
    x_deep = Dropout(0.5)(x_deep)
    x_deep = BatchNormalization(axis=1)(x_deep)

    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)

    y = Dense(1, activation="sigmoid")(final)

    model = Model(inputs=[user_history, item_input, item_category], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_final = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
# Columns from position 13 onward are used as the click-history slots.
# NOTE(review): the model expects 10 history columns -- confirm df_train's column layout.
click_history = df_train.iloc[:,13:].values
labels = df_train.labels.values
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_final.fit(
    [click_history, articles, category],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category, so the side input describes the
    # item actually being scored.
    # Assumes the lookup dict covers every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    # The user's click history is the same for every candidate row.
    click_history = np.tile(userid_to_article_history[u], num_candidates).reshape(-1, 10)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_final.predict([click_history, items,categories], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 5.2 ENSUS with category and subcategory features

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=(128, 64, 32, 8)):
    """Build and compile the ENSUS model with item category and subcategory features.

    Two branches are concatenated, batch-normalised and fed to a sigmoid output:
      * wide: embeddings of the 10 click-history slots and of the candidate item are
        stacked along the sequence axis, flattened and passed through a
        Dense(relu) + Dropout(0.7) stack ending in a 2-unit linear layer.
      * deep: the category and subcategory embeddings form a 2-step sequence that is
        fed to LSTM(40).

    Args:
        num_users: not used by this model; kept so all variants share one signature.
        num_items: number of distinct articles; the item tables hold ``num_items + 1``
            rows (presumably one extra index for padding -- confirm against the data prep).
        dims: embedding width.
        num_categories: size of the category embedding table.
        num_sub_categories: size of the subcategory embedding table.
        dense_layers: widths of the wide-branch Dense layers.

    Returns:
        A compiled Keras Model taking
        ``[history (10,), item (1,), category (1,), subcategory (1,)]``.
    """
    # User features
    user_history = Input(shape=(10,), name="user")
    # Item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")
    item_subcategory = Input(shape=(1,), name="subcategory")

    # User emb
    click_history_emb = Embedding(output_dim=dims,
                                  input_dim=num_items+1,
                                  input_length=10,
                                  embeddings_initializer='he_normal',
                                  embeddings_regularizer=regularizers.l2(0.001),
                                  name="mf_user_emb")(user_history)

    # Item emb
    item_emb = Embedding(output_dim=dims,
                         input_dim=num_items+1,
                         input_length=1,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_item_emb")(item_input)
    category_emb = Embedding(output_dim=dims,
                             input_dim=num_categories,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="cat_emb")(item_category)
    subcategory_emb = Embedding(output_dim=dims,
                                input_dim=num_sub_categories,
                                input_length=1,
                                embeddings_initializer='he_normal',
                                embeddings_regularizer=regularizers.l2(0.001),
                                name="subcat_emb")(item_subcategory)

    ### Wide
    # axis=1 stacks the 10 history embeddings and the item embedding into 11 steps.
    wide = Concatenate(axis=1)([click_history_emb, item_emb])
    wide = Flatten()(wide)
    for n in dense_layers:
        wide = Dense(n, activation="relu")(wide)
        wide = Dropout(0.7)(wide)
    y_wide = Dense(2)(wide)

    ### Deep
    deep_features = Concatenate(axis=1)([category_emb, subcategory_emb])
    x_deep = LSTM(40)(deep_features)

    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)

    y = Dense(1, activation="sigmoid")(final)

    model = Model(inputs=[user_history, item_input, item_category,item_subcategory], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_final = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
subcategory = df_train.article_sub_category.values
# Columns from position 13 onward are used as the click-history slots.
# NOTE(review): the model expects 10 history columns -- confirm df_train's column layout.
click_history = df_train.iloc[:,13:].values
labels = df_train.labels.values
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_final.fit(
    [click_history, articles, category, subcategory],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category/subcategory, so the side inputs
    # describe the item actually being scored.
    # Assumes the lookup dicts cover every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in items]).reshape(-1,1)

    # The user's click history is the same for every candidate row.
    click_history = np.tile(userid_to_article_history[u], num_candidates).reshape(-1, 10)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_final.predict([click_history, items,categories,subcategories], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 5.3 ENSUS with category, subcategory and user-profile features

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=(128, 64, 32, 8)):
    """Build and compile the ENSUS model with category, subcategory and user-profile features.

    Two branches are concatenated, batch-normalised and fed to a sigmoid output:
      * wide: embeddings of the 10 click-history slots and of the candidate item are
        stacked along the sequence axis, flattened and passed through a
        Dense(relu) + Dropout(0.7) stack ending in a 2-unit linear layer.
      * deep: the category, subcategory and 6 profile embeddings form an 8-step
        sequence that is fed to LSTM(40).

    Args:
        num_users: not used by this model; kept so all variants share one signature.
        num_items: number of distinct articles; the item tables hold ``num_items + 1``
            rows (presumably one extra index for padding -- confirm against the data prep).
        dims: embedding width.
        num_categories: size of the category embedding table.
        num_sub_categories: size of the subcategory table; the profile table uses the
            same size (presumably profile entries are subcategory ids -- confirm).
        dense_layers: widths of the wide-branch Dense layers.

    Returns:
        A compiled Keras Model taking
        ``[history (10,), item (1,), category (1,), subcategory (1,), profile (6,)]``.
    """
    # User features
    user_history = Input(shape=(10,), name="user")
    user_profile_input = Input(shape=(6,), name="profile")
    # Item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")
    item_subcategory = Input(shape=(1,), name="subcategory")

    # User emb
    click_history_emb = Embedding(output_dim=dims,
                                  input_dim=num_items+1,
                                  input_length=10,
                                  embeddings_initializer='he_normal',
                                  embeddings_regularizer=regularizers.l2(0.001),
                                  name="mf_user_emb")(user_history)
    profile_emb = Embedding(output_dim=dims,
                            input_dim=num_sub_categories,
                            input_length=6,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_profile_emb")(user_profile_input)

    # Item emb
    item_emb = Embedding(output_dim=dims,
                         input_dim=num_items+1,
                         input_length=1,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_item_emb")(item_input)
    category_emb = Embedding(output_dim=dims,
                             input_dim=num_categories,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="cat_emb")(item_category)
    subcategory_emb = Embedding(output_dim=dims,
                                input_dim=num_sub_categories,
                                input_length=1,
                                embeddings_initializer='he_normal',
                                embeddings_regularizer=regularizers.l2(0.001),
                                name="subcat_emb")(item_subcategory)

    ### Wide
    # axis=1 stacks the 10 history embeddings and the item embedding into 11 steps.
    wide = Concatenate(axis=1)([click_history_emb, item_emb])
    wide = Flatten()(wide)
    for n in dense_layers:
        wide = Dense(n, activation="relu")(wide)
        wide = Dropout(0.7)(wide)
    y_wide = Dense(2)(wide)

    ### Deep
    deep_features = Concatenate(axis=1)([category_emb, subcategory_emb,profile_emb])
    x_deep = LSTM(40)(deep_features)

    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)

    y = Dense(1, activation="sigmoid")(final)

    model = Model(inputs=[user_history, item_input, item_category,item_subcategory, user_profile_input], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_final = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
subcategory = df_train.article_sub_category.values
# Columns from position 13 onward are used as the click-history slots and columns
# 1..6 as the user profile.
# NOTE(review): the model expects 10 history and 6 profile columns -- confirm df_train's layout.
click_history = df_train.iloc[:,13:].values
profile = df_train.iloc[:,1:7].values
labels = df_train.labels.values
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_final.fit(
    [click_history, articles, category, subcategory, profile],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category/subcategory, so the side inputs
    # describe the item actually being scored.
    # Assumes the lookup dicts cover every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in items]).reshape(-1,1)
    # The user's profile and click history are the same for every candidate row.
    profile = np.tile(np.array(userid_to_profile[u]), num_candidates).reshape(-1, 6)
    click_history = np.tile(userid_to_article_history[u], num_candidates).reshape(-1, 10)
    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_final.predict([click_history, items,categories,subcategories,profile], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 5.4 ENSUS with category, subcategory, user-profile and title features

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=(128, 64, 32, 8)):
    """Build and compile the ENSUS model with category, subcategory, user-profile and title features.

    Two branches are concatenated, batch-normalised and fed to a sigmoid output:
      * wide: embeddings of the 10 click-history slots, the candidate item and the
        10 title tokens are stacked along the sequence axis, flattened and passed
        through a Dense(relu) + Dropout(0.7) stack ending in a 2-unit linear layer.
      * deep: the category, subcategory and 6 profile embeddings form an 8-step
        sequence that is fed to LSTM(40).

    The title vocabulary size (`num_words_title`) is read from the notebook globals.

    Args:
        num_users: not used by this model; kept so all variants share one signature.
        num_items: number of distinct articles; the item tables hold ``num_items + 1``
            rows (presumably one extra index for padding -- confirm against the data prep).
        dims: embedding width.
        num_categories: size of the category embedding table.
        num_sub_categories: size of the subcategory table; the profile table uses the
            same size (presumably profile entries are subcategory ids -- confirm).
        dense_layers: widths of the wide-branch Dense layers.

    Returns:
        A compiled Keras Model taking ``[history (10,), item (1,), category (1,),
        subcategory (1,), profile (6,), title (10,)]``.
    """
    # User features
    user_history = Input(shape=(10,), name="user")
    user_profile_input = Input(shape=(6,), name="profile")
    # Item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")
    item_subcategory = Input(shape=(1,), name="subcategory")
    item_title = Input(shape=(10,), name="title")

    # User emb
    click_history_emb = Embedding(output_dim=dims,
                                  input_dim=num_items+1,
                                  input_length=10,
                                  embeddings_initializer='he_normal',
                                  embeddings_regularizer=regularizers.l2(0.001),
                                  name="mf_user_emb")(user_history)
    profile_emb = Embedding(output_dim=dims,
                            input_dim=num_sub_categories,
                            input_length=6,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_profile_emb")(user_profile_input)

    # Item emb
    item_emb = Embedding(output_dim=dims,
                         input_dim=num_items+1,
                         input_length=1,
                         embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         name="mf_item_emb")(item_input)
    category_emb = Embedding(output_dim=dims,
                             input_dim=num_categories,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="cat_emb")(item_category)
    subcategory_emb = Embedding(output_dim=dims,
                                input_dim=num_sub_categories,
                                input_length=1,
                                embeddings_initializer='he_normal',
                                embeddings_regularizer=regularizers.l2(0.001),
                                name="subcat_emb")(item_subcategory)
    # input_length matches the 10-token title input declared above.
    title_emb = Embedding(output_dim=dims,
                          input_dim=num_words_title,
                          input_length=10,
                          embeddings_initializer='he_normal',
                          embeddings_regularizer=regularizers.l2(0.001),
                          name="title_em")(item_title)

    ### Wide
    # axis=1 stacks history (10), item (1) and title (10) embeddings into 21 steps.
    wide = Concatenate(axis=1)([click_history_emb, item_emb, title_emb])
    wide = Flatten()(wide)
    for n in dense_layers:
        wide = Dense(n, activation="relu")(wide)
        wide = Dropout(0.7)(wide)
    y_wide = Dense(2)(wide)

    ### Deep
    deep_features = Concatenate(axis=1)([category_emb, subcategory_emb,profile_emb])
    x_deep = LSTM(40)(deep_features)

    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)

    y = Dense(1, activation="sigmoid")(final)

    model = Model(inputs=[user_history, item_input, item_category,item_subcategory, user_profile_input,item_title], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_final = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
###### Training ########
# Model inputs and click labels pulled out of the training frame as NumPy arrays.
user_input = df_train.user_id.values
articles = df_train.article_id.values
category = df_train.article_category.values
subcategory = df_train.article_sub_category.values
# Columns from position 13 onward are used as the click-history slots and columns
# 1..6 as the user profile.
# NOTE(review): the model expects 10 history and 6 profile columns -- confirm df_train's layout.
click_history = df_train.iloc[:,13:].values
profile = df_train.iloc[:,1:7].values
labels = df_train.labels.values

# One row of title tokens per sample; the model's title input expects 10 tokens per row.
titles = np.array([np.array(t) for t in df_train.titles.values])
epochs = 3
# A single fit call trains for all epochs and keeps every epoch's metrics in
# `hist.history`; the optimizer state carries across epochs either way.
hist = model_final.fit(
    [click_history, articles, category, subcategory, profile, titles],
    labels,
    validation_split=0.1,  # Keras holds out the last 10% of rows, taken before shuffling
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Ranking evaluation: for up to 100 (user, clicked article) pairs from df_test, score the
# clicked article together with the items returned by get_not_interacted and record
# hit ratio and NDCG on the top 10 and top 5 of the ranking.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded so the same pairs are sampled, and the same metrics reported, on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# A list has a length, so tqdm can show progress against a total.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate pool: the clicked article first, then the non-interacted ones.
    items = np.array([i]+not_interacted_items)
    num_candidates = len(items)
    users = np.array([u]*num_candidates)
    # Each candidate row carries its own category/subcategory/title, so the side
    # inputs describe the item actually being scored.
    # Assumes the lookup dicts cover every article get_not_interacted can return.
    categories = np.array([article_to_category[a] for a in items]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in items]).reshape(-1,1)
    titles = np.array([articleId_to_title[a] for a in items]).reshape(-1,10)
    # The user's profile and click history are the same for every candidate row.
    profile = np.tile(np.array(userid_to_profile[u]), num_candidates).reshape(-1, 6)
    click_history = np.tile(userid_to_article_history[u], num_candidates).reshape(-1, 10)

    # verbose=0 keeps Keras' per-call progress bar from flooding the tqdm output.
    predictions = model_final.predict([click_history, items,categories,subcategories,profile,titles], verbose=0)
    predicted_labels = predictions.reshape(-1)
    # Highest score first; keep the ten best candidates.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean hit ratio (top 10, top 5) followed by mean NDCG (top 10, top 5), one value per line.
print(*(np.average(scores) for scores in (h_ten, h_five, n_ten, n_five)), sep="\n")

# 5.5 ENSUS with category, subcategory, user-profile, title and abstract features

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=(128, 64, 32, 8)):
    """Build and compile the wide/deep click model that also takes the article abstract.

    Args:
        num_users: Unused by the network; kept so existing calls keep working.
        num_items: Number of distinct articles; the click-history and item
            embeddings are created with ``num_items + 1`` rows.
        dims: Output dimension of every embedding.
        num_categories: Vocabulary size of the category embedding.
        num_sub_categories: Vocabulary size of the subcategory embedding and of
            the profile embedding.
        dense_layers: Widths of the ReLU Dense layers in the wide tower; each is
            followed by ``Dropout(0.7)``.

    Returns:
        A compiled Keras ``Model`` (Adam with learning rate 0.001, binary
        cross-entropy, accuracy) with a single sigmoid output. Inputs are, in
        order: click history (10), item id (1), category (1), subcategory (1),
        profile (6), title (10), abstract (10).

    Note:
        Reads the notebook globals ``num_words_title`` and ``num_words_abstract``
        for the title/abstract vocabulary sizes; they must be defined before the
        call.
    """
    def regularized_embedding(vocab_size, seq_len, name):
        # All id/text embeddings share the same initializer and L2 penalty.
        return Embedding(output_dim=dims, embeddings_initializer='he_normal',
                         embeddings_regularizer=regularizers.l2(0.001),
                         input_dim=vocab_size, input_length=seq_len, name=name)

    # User features
    user_history = Input(shape=(10,), name="user")
    user_profile_input = Input(shape=(6,), name="profile")
    # Item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")
    item_subcategory = Input(shape=(1,), name="subcategory")
    item_title = Input(shape=(10,), name="title")
    abstract = Input(shape=(10,), name="abstract")

    # User embeddings
    click_history_emb = regularized_embedding(num_items+1, 10, "mf_user_emb")(user_history)
    # The profile embedding uses the default initializer and no regularizer.
    profile_emb = Embedding(output_dim=dims, input_dim=num_sub_categories, input_length=6, name="mf_profile_emb")(user_profile_input)

    # Item embeddings
    item_emb = regularized_embedding(num_items+1, 1, "mf_item_emb")(item_input)
    category_emb = regularized_embedding(num_categories, 1, "cat_emb")(item_category)
    subcategory_emb = regularized_embedding(num_sub_categories, 1, "subcat_emb")(item_subcategory)
    # input_length matches the 10-token title/abstract inputs declared above.
    title_emb = regularized_embedding(num_words_title, 10, "title_em")(item_title)
    abstract_emb = regularized_embedding(num_words_abstract, 10, "abstract_em")(abstract)

    ### Wide
    # The abstract embedding is concatenated here so the abstract input actually
    # contributes to the prediction; all four tensors are (batch, seq_len, dims).
    wide = Concatenate(axis=1)([click_history_emb, item_emb, title_emb, abstract_emb])
    wide = Flatten()(wide)
    for units in dense_layers:
        wide = Dense(units, activation="relu")(wide)
        wide = Dropout(0.7)(wide)
    y_wide = Dense(2)(wide)

    ### Deep
    # Category, subcategory and the six profile slots form a sequence fed to the LSTM.
    deep_features = Concatenate(axis=1)([category_emb, subcategory_emb,profile_emb])
    x_deep = LSTM(40)(deep_features)

    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)

    y = Dense(1, activation="sigmoid")(final)

    model = Model(inputs=[user_history, item_input, item_category,item_subcategory, user_profile_input,item_title,abstract], outputs=y)
    model.compile(
        optimizer=Adam(0.001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_final = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
#abstract = np.array([np.array(a) for a in df_train.abstract.values])


In [None]:
###### Training ########
# Model inputs taken from df_train, one array per Keras input.
user_input = df_train.user_id.values  # not passed to the model below
articles = df_train.article_id.values
category = df_train.article_category.values
subcategory = df_train.article_sub_category.values
# NOTE(review): positional slices depend on df_train's column order — presumably
# columns 13+ are the 10 click-history slots and columns 1-6 the profile slots.
# Confirm that no extra column (e.g. the abstract) falls inside the 13+ slice.
click_history = df_train.iloc[:,13:].values
profile = df_train.iloc[:,1:7].values
labels = df_train.labels.values

# Each row of df_train.titles / df_train.abstract is a sequence of token ids;
# stack them into one array per feature.
titles = np.array([np.array(t) for t in df_train.titles.values])
abstract = np.array([np.array(a) for a in df_train.abstract.values])

# Train in a single fit() call so `hist.history` keeps the metrics of every epoch.
# (Calling fit(epochs=1) in a Python loop overwrote `hist` and kept only the last epoch.)
epochs = 3
hist = model_final.fit(
    [click_history, articles, category, subcategory, profile, titles, abstract],
    labels,
    validation_split=0.1,
    epochs=epochs,
    shuffle=True,
)

In [None]:
# Offline ranking evaluation: for each sampled (user, held-out item) pair, score the
# held-out item together with the items returned by get_not_interacted() and check
# whether the held-out item lands in the top 10 / top 5.
test_users = df_test.user_id.values
test_items = df_test.article_id.values
# Seeded shuffle so the same 100 pairs are evaluated on every run.
test_users, test_items = shuffle(test_users, test_items, random_state=SEED)
# Materialised as a list so tqdm can show a total and the pairs can be re-iterated.
test_set = list(zip(test_users[:100], test_items[:100]))
hits_ten,hits_five,ndcgs_ten,ndcgs_five = [], [], [], []
h_ten, h_five, n_ten, n_five = [], [], [], []
for (u,i) in tqdm(test_set):
    not_interacted_items = get_not_interacted(u, merged)
    # Candidate list: the held-out item first, then the sampled negatives.
    candidates = [i] + list(not_interacted_items)
    n_candidates = len(candidates)
    items = np.array(candidates)

    # Item-side features must belong to each candidate, not to the held-out item:
    # tiling item i's category/subcategory/title/abstract over every row scores the
    # negatives with the positive's features.
    # assumes the lookup dicts have an entry for every candidate id — TODO confirm
    categories = np.array([article_to_category[a] for a in candidates]).reshape(-1,1)
    subcategories = np.array([article_to_subcategory[a] for a in candidates]).reshape(-1,1)
    titles = np.array([articleId_to_title[a] for a in candidates]).reshape(-1,10)
    abstracts = np.array([article_to_abstract[a] for a in candidates]).reshape(-1,10)

    # User-side features are identical for every candidate row.
    profile = np.tile(np.array(userid_to_profile[u]), n_candidates).reshape(-1, 6)
    click_history = np.tile(userid_to_article_history[u], n_candidates).reshape(-1, 10)

    # verbose=0 keeps Keras from printing a progress bar on every iteration.
    predictions = model_final.predict([click_history, items,categories,subcategories,profile,titles,abstracts], verbose=0)
    predicted_labels = np.ravel(predictions)
    # Candidate ids ordered by descending predicted score, truncated to ten.
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    h_ten.append(getHitRatio(top_ten_items, i))
    h_five.append(getHitRatio(top_ten_items[:5], i))
    n_ten.append(getNDCG(top_ten_items, i))
    n_five.append(getNDCG(top_ten_items[:5], i))

In [None]:
# Mean of the per-pair metrics collected by the evaluation loop, labelled so the
# output can be read without looking up the print order.
for metric_name, metric_values in (
    ("HR@10", h_ten),
    ("HR@5", h_five),
    ("NDCG@10", n_ten),
    ("NDCG@5", n_five),
):
    print(f"{metric_name}: {np.average(metric_values):.4f}")