In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tqdm import tqdm
import re
import scipy
import tensorflow
from tensorflow.keras.layers import Input,Flatten, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU, LSTM
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization, Add, Multiply
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
#import keras.backend as K
from sklearn.utils import shuffle
import seaborn as sns
import math
import shap

In [2]:
PATH = "../data/mind_small/"

# Both TSV files are read without a header row; column names are assigned below.
news = pd.read_csv(PATH + "news.tsv", header=None, sep="\t")
news.columns = [
    "news_id", "category", "sub_category", "title",
    "abstract", "url", "title_entities", "abstract_entities",
]

behaviors = pd.read_csv(PATH + "behaviors.tsv", header=None, sep="\t")
behaviors.columns = ["idx", "user_id", "time", "history", "impressions"]
# Keep one row per (user, history) pair and discard rows missing either field.
behaviors = (
    behaviors
    .drop_duplicates(["user_id", "history"])
    .dropna(subset=["user_id", "history"])
)

In [3]:
# user_id -> list of clicked news ids. When a user appears in several rows,
# the history from the last row wins (same as repeated dict assignment).
sessions = {
    user_id: history.split(" ")
    for user_id, history in zip(behaviors["user_id"], behaviors["history"])
}

# Flatten to one (user, news_id) pair per click.
users = [user_id for user_id, history in sessions.items() for _ in history]
clicks = [news_id for history in sessions.values() for news_id in history]

tuples = list(zip(users, clicks))
interactions = pd.DataFrame(tuples, columns=["user", "news_id"])
# Work on the first 10000 interactions only.
interactions = interactions[:10000]

In [4]:
# Attach the article metadata to every click (join on the news id).
merged = pd.merge(interactions, news, on="news_id")
merged.head()

Unnamed: 0,user,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities
0,U13740,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."
1,U10045,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."
2,U85394,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."
3,U78244,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."
4,U27024,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata..."


In [5]:
# Row count before and after removing fully identical rows.
print(merged.shape[0])
merged = merged.drop_duplicates()
print(merged.shape[0])

10000
9861


# 1. Preprocessing

In [6]:
# Keep only users with more than 5 interactions (i.e. drop users with 5 or fewer).
print("Len before removal: ",len(merged))
_user_counts = merged["user"].value_counts()
_keys = _user_counts[_user_counts > 5].keys()
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered frame (avoids SettingWithCopyWarning).
merged = merged[merged["user"].isin(_keys)].copy()
print("Len after removal: ",len(merged))


# Map the raw string ids to contiguous integer ids starting at 0.
user_enc = LabelEncoder()
article_enc = LabelEncoder()
merged["user_id"] = user_enc.fit_transform(merged["user"].values)
merged["article_id"] = article_enc.fit_transform(merged["news_id"].values)




Len before removal:  9861
Len after removal:  9728


In [7]:
import nltk
from nltk.corpus import stopwords
# Helper functions
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Remove English stop words from a whitespace-separated string.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are re-joined with
    single spaces. Matching is case-sensitive, so lower-case the text first.

    The stop-word set is loaded once and cached on the function object;
    the original rebuilt it from the NLTK corpus on every call.
    """
    stops = getattr(remove_stop_words, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        remove_stop_words._stops = stops
    return " ".join(word for word in text.split() if word not in stops)

def remove_html(text):
    """Strip anything that looks like an HTML tag (``<`` ... first ``>``) from ``text``."""
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace."""
    non_word = re.compile(r'[^\w\s]')
    return non_word.sub('', text)

def text_to_list(text):
    """Split ``text`` on single spaces (consecutive spaces yield empty strings)."""
    return text.split(" ")

In [8]:
def clean_title(df):
    """Add a ``title_cleaned`` column: lower-cased, stop words and punctuation removed.

    The frame is modified in place and also returned.
    """
    cleaned = df.title
    # Order matters: stop-word removal runs on lower-cased text, before punctuation is stripped.
    for step in (make_lower_case, remove_stop_words, remove_punctuation):
        cleaned = cleaned.apply(func = step)
    df["title_cleaned"] = cleaned
    return df
def hyphen_to_underline(category):
    """
    Replace every hyphen in a sub-category name with an underscore,
    so the name is kept as one token by the tf-idf step.
    """
    return "_".join(category.split("-"))
# Add the cleaned title and the tokenizer-safe sub-category name to every row.
merged = clean_title(merged)
merged["subcategory_cleaned"] = merged["sub_category"].map(hyphen_to_underline)

# Alternative to tf-idf

# End alternative to tfidf

In [9]:
# One tf-idf row per interaction row of `merged`; each sub-category is a single token.
vectorizer = TfidfVectorizer(analyzer="word", tokenizer=str.split)
# NOTE: item_ids is reassigned in the next cell to the per-row (non-unique) list.
item_ids = merged["article_id"].unique().tolist()
tfidf_matrix = vectorizer.fit_transform(merged["subcategory_cleaned"])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<9728x164 sparse matrix of type '<class 'numpy.float64'>'
	with 9728 stored elements in Compressed Sparse Row format>

In [10]:
# Article id of every row of `merged`, in row order (aligned with tfidf_matrix rows).
item_ids = merged["article_id"].tolist()

def get_item_profile(item_id):
    """
    Look up the tf-idf row of an article.

    item_id: the (encoded) news article id
    Return: a 1-row slice of tfidf_matrix, taken at the first position
        where item_id occurs in item_ids. Raises ValueError if it is absent.
    """
    row = item_ids.index(item_id)
    return tfidf_matrix[row:row + 1]
    
def get_item_profiles(ids):
    """Stack the tf-idf rows of the given article ids into one sparse matrix (one row per id)."""
    rows = []
    for article_id in ids:
        rows.append(get_item_profile(article_id))
    return scipy.sparse.vstack(rows)

def build_user_profile(person_id):
    """Build the L2-normalised tf-idf profile of one user.

    The tf-idf rows of every article the user interacted with are summed
    column-wise and the resulting vector is normalised to unit length.

    The original computed the normalised vector but returned the raw sums;
    it also passed an ``np.matrix`` (what a sparse ``sum`` yields) to
    ``normalize``. The sum is now converted to a plain ndarray and the
    normalised profile is what gets returned. Scaling by a positive constant
    does not change the ranking of the entries.

    person_id: encoded user id (value of merged["user_id"])
    Return: ndarray of shape (1, n_features)
    """
    article_ids = merged[merged["user_id"] == person_id]["article_id"].values
    summed = np.asarray(get_item_profiles(article_ids).sum(axis=0))
    return sklearn.preprocessing.normalize(summed)
    
#t = build_user_profile(1)

In [11]:
def calculate_user_profiles(unique_user_ids):
    """Return {user_id: [(feature_name, weight), ...]} holding each user's 6 highest-weighted features."""
    profiles = {}
    for user in tqdm(unique_user_ids):
        weights = build_user_profile(user).tolist()[0]
        # Sort by descending weight; sorted() is stable, so ties keep feature order.
        ranked = sorted(zip(tfidf_feature_names, weights), key=lambda pair: -pair[-1])
        profiles[user] = ranked[:6]
    return profiles

user_profiles = calculate_user_profiles(merged["user_id"].unique())


100%|██████████| 231/231 [00:07<00:00, 29.50it/s]


In [12]:
# Sub-category names get ids 1..N in tf-idf feature order; id 0 is reserved for "Null".
subcategory_to_id = dict(zip(tfidf_feature_names, range(1, len(tfidf_feature_names) + 1)))
id_to_subcategory = {idx: name for name, idx in subcategory_to_id.items()}
id_to_subcategory[0] = "Null"
subcategory_to_id["Null"] = 0

In [13]:
# Attach each user's profile (the ids of their top sub-categories) to every row.
# The user_id column is iterated directly: iterrows() would build a full Series
# per row only to read this one field.
profile_array = [
    [subcategory_to_id[keyword_tuple[0]] for keyword_tuple in user_profiles[user_idx]]
    for user_idx in tqdm(merged["user_id"].values)
]
merged["profile"] = profile_array

9728it [00:00, 11841.04it/s]


In [14]:
# Integer id of each article's sub-category (ids come from subcategory_to_id).
merged["subcategory_to_int"] = [subcategory_to_id[name] for name in merged["subcategory_cleaned"].values]

# user_id -> profile, read from the first row of each user.
user_unique = merged.drop_duplicates("user_id")
userid_to_profile = user_unique.set_index("user_id")["profile"].to_dict()

# Integer id of each article's main category.
category_enc = LabelEncoder()
merged["main_category_int"] = category_enc.fit_transform(merged["category"].values)

# article_id -> main-category id / sub-category id.
article_id_to_category_int = merged.set_index("article_id")["main_category_int"].to_dict()
article_id_to_subcategory_int = merged.set_index("article_id")["subcategory_to_int"].to_dict()

In [15]:
def train_test_split(df, user_id, article_id, have_timestamp, timestamp):
    """
    Leave-one-out split: the last row of every user goes to the test set,
    all remaining rows form the training set.

    NOTE: this definition shadows sklearn.model_selection.train_test_split,
    which is imported at the top of the notebook.

    params:
        df: interaction frame
        user_id: name of the user column
        article_id: name of the article column (not used by the split itself)
        have_timestamp: if True, "last" means the row with the largest
            value in the `timestamp` column; otherwise it means the user's
            last row in the frame's current order
        timestamp: name of the timestamp column (ignored when
            have_timestamp is False)
    returns:
        (df_train, df_test)
    """
    sort_key = timestamp if have_timestamp else user_id
    # The default quicksort is not stable, so rows with equal keys could be
    # reordered and "last row per user" would be arbitrary. mergesort is stable.
    df_test = df.sort_values(sort_key, kind="mergesort").groupby(user_id).tail(1)
    df_train = df.drop(index=df_test.index)

    assert df_test.shape[0] + df_train.shape[0] == df.shape[0]

    return df_train, df_test
# No timestamp column is available, so have_timestamp=False and the timestamp argument (0) is ignored.
df_train_true, df_test_true = train_test_split(merged, "user_id", "article_id", False, 0)

In [17]:
# Quick look at the columns of the training split.
df_train_true.head(1)

Unnamed: 0,user,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities,user_id,article_id,title_cleaned,subcategory_cleaned,profile,subcategory_to_int,main_category_int
0,U13740,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","We'd like to solve the puzzle, Pat: Blair Davi...",https://assets.msn.com/labs/mind/AAIORni.html,[],"[{""Label"": ""Pat Sajak"", ""Type"": ""P"", ""Wikidata...",13,3563,wheel fortune guest delivers hilarious rails i...,tvnews,"[119, 151, 23, 61, 74, 99]",151,11


In [16]:
def get_userid_to_article_history(df, max_len=30, padding=0):
    """Map every user to a fixed-length array of the articles they clicked.

    Histories longer than ``max_len`` keep their first ``max_len`` entries;
    shorter ones are right-padded with ``padding``.

    NOTE(review): the default padding value 0 is also a valid encoded
    article id (LabelEncoder ids start at 0) -- confirm this is intended.

    df: frame with "user_id" and "article_id" columns
    max_len: length of every returned history (default 30, the former constant)
    padding: value used to fill short histories (default 0)
    Return: dict {user_id: 1-D array of length max_len}, in order of first
        appearance of each user in df
    """
    userid_to_article_history = {}
    # A single groupby pass replaces one full-frame filter per user;
    # sort=False keeps users in order of first appearance.
    per_user_clicks = df.groupby("user_id", sort=False)["article_id"]
    for user_id, clicks in tqdm(per_user_clicks):
        click_history = clicks.values[:max_len]
        missing = max_len - len(click_history)
        if missing > 0:
            click_history = np.append(click_history, [padding] * missing)
        userid_to_article_history[user_id] = click_history
    return userid_to_article_history
# Histories are built from the training split only.
userid_to_article_history = get_userid_to_article_history(df_train_true)

100%|██████████| 231/231 [00:00<00:00, 1400.28it/s]


In [None]:
all_article_ids = merged["article_id"].unique()

def negative_sampling(train_df, all_article_ids, user_id, article_id, num_negatives=4):
    """
    Build the training set: every positive (user, article) pair is followed by
    `num_negatives` randomly drawn articles the user has not clicked in train_df.

    Reads the module-level lookups userid_to_article_history, userid_to_profile,
    article_id_to_category_int and article_id_to_subcategory_int.

    train_df: frame holding the positive interactions
    all_article_ids: pool the negative articles are drawn from
    user_id / article_id: names of the user and article columns of train_df
    num_negatives: negatives per positive (default 4, the former constant)

    Return: shuffled DataFrame with columns user_id, user_history, p0..p5,
        article_id, article_category, article_sub_category, labels
        (labels is 1 for a real click and 0 for a sampled negative)
    """
    user_ids, user_click_history, articles, article_category, article_sub_category, labels = [], [], [], [], [], []
    p0, p1, p2, p3, p4, p5 = [], [], [], [], [], []
    profile_slots = (p0, p1, p2, p3, p4, p5)
    user_item_set = set(zip(train_df[user_id],
                            train_df[article_id]))

    def _add_row(u, item, profile, label):
        # Append one training instance to all parallel column lists.
        user_ids.append(u)
        user_click_history.append(userid_to_article_history[u])
        for position, slot in enumerate(profile_slots):
            slot.append(profile[position])
        article_category.append(article_id_to_category_int[item])
        article_sub_category.append(article_id_to_subcategory_int[item])
        articles.append(item)
        labels.append(label)

    for (u, i) in tqdm(user_item_set):
        profile = np.array(userid_to_profile[u])
        _add_row(u, i, profile, 1)
        for _ in range(num_negatives):
            # Redraw until the article is not one of this user's positives.
            negative_item = np.random.choice(all_article_ids)
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(all_article_ids)
            _add_row(u, negative_item, profile, 0)

    user_ids, user_click_history, p0, p1, p2, p3, p4, p5, articles,article_category,article_sub_category, labels = shuffle(user_ids,user_click_history, p0, p1, p2, p3, p4, p5, articles,article_category,article_sub_category, labels, random_state=0)

    return pd.DataFrame(list(zip(user_ids,user_click_history,p0, p1, p2, p3, p4, p5, articles,article_category,article_sub_category, labels)), columns=["user_id","user_history","p0", "p1", "p2", "p3", "p4", "p5", "article_id","article_category","article_sub_category", "labels"])



df_train = negative_sampling(df_train_true, all_article_ids, "user_id", "article_id")

In [None]:
def fix_dftrain(df, column, max_len, padding):
    """Spread the sequences stored in ``df[column]`` over ``max_len`` scalar columns.

    Adds columns ``<column>_0`` .. ``<column>_<max_len-1>`` to ``df`` in place;
    positions past the end of a sequence are filled with ``padding``.
    The source column is kept. Returns ``df``.
    """
    def _element_at(position):
        # Picker for one position, falling back to the padding value.
        def _pick(sequence):
            if position < len(sequence):
                return sequence[position]
            return padding
        return _pick

    for position in tqdm(range(max_len)):
        df[column + "_" + str(position)] = df[column].apply(_element_at(position))
    return df

# Expand the click-history arrays into 30 scalar columns (padding value 0),
# then remove the original array column.
df_train = fix_dftrain(df_train, "user_history", 30, 0)
# NOTE: the in-place drop makes this cell non-idempotent; running it a second
# time fails because "user_history" no longer exists.
df_train.drop(columns=["user_history"], inplace=True)
df_train.head()

In [None]:
# For each user; for each item the user has interacted with in the test set;
    # Sample 99 items the user has not interacted with in the past and add the one test item

def negative_sample_testset(ordiginal_df, df_test, all_article_ids, user_id, article_id):
    """
    Build the ranking test set: for every (user, held-out article) pair in
    df_test, emit 99 sampled articles the user never interacted with
    (according to ordiginal_df) followed by the held-out article itself.

    Reads the module-level lookups userid_to_profile, userid_to_article_history,
    article_id_to_category_int and article_id_to_subcategory_int.

    Changes versus the original:
      * negatives are drawn without replacement, so a user's candidate list
        has no repeated negatives (with replacement is used only when fewer
        than 99 candidates exist);
      * rows are written into a pre-allocated object array instead of calling
        np.array() on nested lists that contain a 30-long history array,
        which NumPy 1.24+ rejects as an inhomogeneous shape.

    Return: (X_test, df_test, userid_to_true_item) where X_test is a 2-D object
        array with one row per candidate, df_test is the same data as a
        DataFrame and userid_to_true_item maps each user to the held-out article.
    """
    num_negatives = 99
    columns = ["user_id",
               "click_history",
               "p0",
               "p1",
               "p2",
               "p3",
               "p4",
               "p5",
               "article_id",
               "category",
               "sub_category"]
    test_user_item_set = set(zip(df_test[user_id], df_test[article_id]))
    user_interacted_items = ordiginal_df.groupby(user_id)[article_id].apply(list).to_dict()
    all_items = set(all_article_ids)

    rows = []
    userid_to_true_item = {} # keep track of the real items
    for (u,i) in tqdm(test_user_item_set):
        # Sorted so the candidate order does not depend on set iteration order.
        not_interacted_items = sorted(all_items - set(user_interacted_items[u]))
        allow_repeats = len(not_interacted_items) < num_negatives
        selected_not_interacted = list(
            np.random.choice(not_interacted_items, num_negatives, replace=allow_repeats)
        )
        profile = userid_to_profile[u]
        history = userid_to_article_history[u]
        # The true item is appended last, after the sampled negatives.
        for item in selected_not_interacted + [i]:
            rows.append([u,
                         history,
                         profile[0],
                         profile[1],
                         profile[2],
                         profile[3],
                         profile[4],
                         profile[5],
                         item,
                         article_id_to_category_int[item],
                         article_id_to_subcategory_int[item]])
        userid_to_true_item[u] = i

    # Fill cell by cell so the history array is stored as one object per cell.
    X_test = np.empty((len(rows), len(columns)), dtype=object)
    for r, row in enumerate(rows):
        for c, value in enumerate(row):
            X_test[r, c] = value
    df_test = pd.DataFrame(X_test, columns=columns)
    return X_test, df_test, userid_to_true_item
X_test, df_test, userid_to_true_item = negative_sample_testset(merged, df_test_true, merged["article_id"].unique(), "user_id", "article_id")
    
    

In [None]:
def fix_dftest(df, column, max_len, padding):
    """Expand the sequences in ``df[column]`` into ``max_len`` scalar columns.

    Column ``<column>_<k>`` receives element k of every sequence, or ``padding``
    when the sequence is shorter than k+1. ``df`` is modified in place (the
    source column stays) and returned.
    """
    source = column
    for k in tqdm(range(max_len)):
        target = source + "_" + str(k)
        df[target] = df[source].apply(lambda seq, k=k: seq[k] if k < len(seq) else padding)
    return df

# Expand the click-history arrays into 30 scalar columns (padding value 0),
# then remove the original array column.
df_test = fix_dftest(df_test, "click_history", 30, 0)
# NOTE: the in-place drop makes this cell non-idempotent; running it a second
# time fails because "click_history" no longer exists.
df_test.drop(columns=["click_history"], inplace=True)

In [None]:
def getHitRatio(ranklist, gtItem):
    """Return 1 if the ground-truth item occurs in the ranked list, otherwise 0."""
    found = any(item == gtItem for item in ranklist)
    return 1 if found else 0

def getNDCG(ranklist, gtItem):
    """Return log(2)/log(rank+2) for the first 0-based rank holding the ground-truth item, else 0."""
    for rank, item in enumerate(ranklist):
        if item == gtItem:
            return math.log(2) / math.log(rank + 2)
    return 0

def getNumHits(ranklist, gtItem):
    """Count how many ranked items match one of the ground-truth items.

    The original body stopped at a bare ``if``, which made the whole cell a
    SyntaxError (so getHitRatio and getNDCG were never defined either). It is
    completed following its visible structure: each ranked item is compared
    with every entry of ``gtItem`` and counted at most once.

    ranklist: iterable of ranked items
    gtItem: iterable of ground-truth items
    Return: number of items of ``ranklist`` that equal some entry of ``gtItem``
    """
    h = 0
    for item in ranklist:
        for p in gtItem:
            if item == p:
                h += 1
                break  # count each ranked item once
    return h

In [None]:
def evaluate_one_rating(model, user_id, user_profiles, all_articles,user_clicks, true_item, categories, sub_categories):
    """Score one user's candidate articles and return (HR@10, NDCG@10, HR@5, NDCG@5).

    The model is fed [user_clicks, user_profiles, candidates, categories,
    sub_categories]; candidates are ranked by predicted score, highest first.
    ``user_id`` is accepted but not used.
    """
    candidates = np.array(all_articles).reshape(-1,1)

    scores = np.squeeze(
        model.predict([user_clicks, user_profiles, candidates, categories, sub_categories])
    )
    best_first = np.argsort(scores)[::-1][0:10].tolist()
    top_ten_items = [candidates[position] for position in best_first]
    top_five_items = top_ten_items[:5]

    return (
        getHitRatio(top_ten_items, true_item),
        getNDCG(top_ten_items, true_item),
        getHitRatio(top_five_items, true_item),
        getNDCG(top_five_items, true_item),
    )

def evalaute_model(model, df_test, userid_to_true_item):
    """Evaluate the model per test user; return the lists (HR@10, NDCG@10, HR@5, NDCG@5).

    Features are selected by column position: 1..6 profile, 8 category,
    9 sub-category, 10 onwards click history.
    """
    print("Evaluate model")
    hits, ndcgs, hits_five, ndcgs_five = [], [], [], []

    def _as_int(selection):
        return selection.values.astype("int64")

    for user_id in tqdm(df_test["user_id"].unique()):
        user_df = df_test[df_test["user_id"] == user_id]  # this user's candidate rows
        true_item = userid_to_true_item[user_id]          # the held-out article
        all_articles = _as_int(user_df["article_id"])
        user_profiles = _as_int(user_df.iloc[:, 1:7])
        user_clicks = _as_int(user_df.iloc[:, 10:])
        categories = _as_int(user_df.iloc[:, 8])
        sub_categories = _as_int(user_df.iloc[:, 9])

        scores = evaluate_one_rating(model, user_id, user_profiles, all_articles,
                                     user_clicks, true_item, categories, sub_categories)
        for bucket, value in zip((hits, ndcgs, hits_five, ndcgs_five), scores):
            bucket.append(value)
    return hits, ndcgs,hits_five,ndcgs_five

In [None]:
def write_accuracy_results(model_name, hit_ten, ndcg_ten, hit_five, ndcg_five):
    """Append one line with the accuracy metrics of ``model_name`` to performance.txt.

    Best effort: a file-system error is reported on stdout instead of raised.
    The file is opened with a context manager so the handle is always closed,
    and only OSError is caught (the former bare ``except`` hid every error).
    """
    s = "{}: Hit@10 : {}, NDCG@10: {}, Hit@5:{}, ndcg@5 {}\n".format(
        model_name, hit_ten, ndcg_ten, hit_five, ndcg_five
    )
    try:
        with open("performance.txt", "a") as file:
            file.write(s)
    except OSError as err:
        print("error file write: {}".format(err))

def write_category_results(model_name, hit_ten, ndcg_ten, hit_five, ndcg_five):
    """Append one line with the category metrics of ``model_name`` to category_performance.txt.

    Best effort: a file-system error is reported on stdout instead of raised.
    The file is opened with a context manager so the handle is always closed,
    and only OSError is caught (the former bare ``except`` hid every error).
    """
    s = "{}: Hit@10 : {}, NDCG@10: {}, Hit@5:{}, ndcg@5 {}\n".format(
        model_name, hit_ten, ndcg_ten, hit_five, ndcg_five
    )
    try:
        with open("category_performance.txt", "a") as file:
            file.write(s)
    except OSError as err:
        print("error file write: {}".format(err))
    

# 4. Models

## 4.1 Final model

In [None]:
# Params
# Vocabulary sizes for the embedding layers (each with one extra slot) and the embedding width.
dims = 20
num_unique_categories = len(subcategory_to_id)
num_users = merged["user_id"].nunique() + 1
num_items = merged["article_id"].nunique() + 1
num_sub_categories = merged["subcategory_to_int"].nunique() + 1
num_categories = merged["main_category_int"].nunique() + 1

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims,num_categories,num_sub_categories, dense_layers=[128, 64, 32, 8]):
    """Build and compile the final two-branch (wide + deep) click-prediction model.

    Inputs, in order: click history (30 ids), user profile (6 ids), item id,
    item category id, item sub-category id. Output: one sigmoid score.

    num_items, num_categories and num_sub_categories size the embedding
    vocabularies and dims is the embedding width. num_users and dense_layers
    are accepted but not used in the body. The profile embedding is sized by
    the module-level num_unique_categories, not by a parameter.

    NOTE(review): a later cell defines another get_model with a different
    signature; whichever cell ran last is the one in effect.
    """
    #User features
    user_history = Input(shape=(30,), name="user")
    user_profile_input = Input(shape=(6,), name="profile")
    #item features
    item_input = Input(shape=(1,), name="item")
    item_category = Input(shape=(1,), name="category")
    item_subcategory = Input(shape=(1,), name="subcategory")
    
    # User emb
    click_history_emb = Embedding(output_dim=dims, input_dim=num_items+1, input_length=30, name="mf_user_emb")(user_history)
    profile_emb = Embedding(output_dim=dims, input_dim=num_unique_categories, input_length=6, name="mf_profile_emb")(user_profile_input)
    
    # Item emb
    item_emb = Embedding(output_dim=dims, input_dim=num_items+1, input_length=1, name="mf_item_emb")(item_input)
    category_emb = Embedding(output_dim=dims, input_dim=num_categories, input_length=1, name="cat_emb")(item_category)
    subcategory_emb = Embedding(output_dim=dims, input_dim=num_sub_categories, input_length=1, name="subcat_emb")(item_subcategory)
    
    ### Wide
    # Click-history and item embeddings are joined along axis 1, flattened and
    # projected to 2 units.
    #wide_history = Flatten()(click_history_emb)
    #wide_item = Flatten()(item_input)
    wide = Concatenate(axis=1)([click_history_emb, item_emb])
    wide = Flatten()(wide)
    y_wide = Dense(2)(wide)
    
    ### Deep
    # Category, sub-category and the six profile embeddings are joined along
    # axis 1 and fed to the LSTM as one sequence.
    deep_features = Concatenate(axis=1)([category_emb,subcategory_emb, profile_emb])
    x_deep = LSTM(40)(deep_features)
    
    # Debug output: shapes of the two branch outputs.
    print(x_deep.shape)
    print(y_wide.shape)
    
    final = Concatenate()([x_deep, y_wide])
    final = BatchNormalization(axis=1)(final)
   
    y = Dense(1, activation="sigmoid")(final)
    
    
    model = Model(inputs=[user_history, user_profile_input, item_input, item_category, item_subcategory], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model = get_model(num_users, num_items, dims, num_categories,num_sub_categories)

In [None]:
###### Training ########
# Columns of df_train are selected by position. After the history expansion the
# layout is: 0 user_id, 1-6 p0..p5, 7 article_id, 8 article_category,
# 9 article_sub_category, 10 labels, 11+ user_history_0..29.
user_history = df_train.iloc[:, 11:].values.astype("int64")
profile_input = df_train.iloc[:, 1:7].values.astype("int64")
item_input = df_train.iloc[:, 7].values.reshape((-1,1)).astype("int64")
labels = df_train.iloc[:, 10].values.reshape((-1,1)).astype("int64")
category_input = df_train.iloc[:, 8].values.reshape((-1,1)).astype("int64")
subcategory_input = df_train.iloc[:, 9].values.reshape((-1,1)).astype("int64")
print(user_history.shape,profile_input.shape, item_input.shape, labels.shape )

In [None]:
all_user_ids = merged["user_id"].unique()

# Learning curves: one entry per epoch, each the value list Keras reports for that fit() call.
train_loss, val_loss = [], []
train_acc, val_acc = [], []

# Ranking metrics per epoch, plus the metrics of the epoch with the best Hit@10.
hits_list, ndcg_list = [], []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=2
for epoch in range(epochs):
    # Train one epoch at a time so the ranking metrics can be computed in between.
    hist = model.fit(
        [user_history, profile_input, item_input,category_input, subcategory_input ],
        labels,
        epochs=1,
        validation_split=0.1,
        shuffle=True,
        verbose=1,
    )
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model( model, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    if not (temp_hits <= best_hits):
        best_hits, best_ndcgs = temp_hits, temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)

In [None]:
# Best-epoch ranking metrics. The last two lines report the @5 metrics
# (they were previously labelled "@ 10").
print("Hit @ 10: {:.2f}".format(best_hits))
print("NDCG @ 10: {:.2f}".format(best_ndcgs))
print("Hit @ 5: {:.2f}".format(best_hits_five))
print("NDCG @ 5: {:.2f}".format(best_ndcgs_five))

In [None]:
# Append the best-epoch metrics of the final model to performance.txt.
write_accuracy_results("main", best_hits, best_ndcgs, best_hits_five, best_ndcgs_five)

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation accuracy per epoch (final model).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_acc)
ax.plot(val_acc)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("final_accuracy.pdf")
plt.show()


In [None]:
# Training vs. validation loss per epoch (final model).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("final_loss.pdf")
plt.show()

In [None]:
# Hit@10 and training loss per epoch on one axis (final model).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(hits_list)
ax.plot(train_loss)
ax.set_title('Hit ratio vs Loss')
ax.set_xlabel('epoch')
ax.legend(['Hit@10', 'Train loss'], loc='upper left')
fig.savefig("final_hit_loss.pdf")
plt.show()

In [None]:
def get_article_category(article_id):
    """Return the cleaned sub-category name of the first `merged` row with this article id."""
    is_article = merged["article_id"] == article_id
    return merged[is_article]["subcategory_cleaned"].values[0]
def get_userprofile_to_name(user_id, id_to_subcategory):
    """
    Return the user's profile (looked up in df_train) as a list of
    sub-category names, translated through id_to_subcategory.
    """
    names = []
    for category_id in get_user_profile(df_train, user_id):
        names.append(id_to_subcategory[category_id])
    return names
def get_user_profile(df, user_id):
    """
    Return the values in column positions 1..6 of the first row of df
    whose "user_id" equals user_id.
    """
    user_rows = df[df["user_id"] == user_id]
    return user_rows.iloc[0, 1:7].values
def get_article_content(article_id):
    """Return (title, sub_category) of the first `merged` row with this article id."""
    first_match = merged[merged["article_id"] == article_id].head(1)
    return first_match["title"].values[0], first_match["sub_category"].values[0]

def get_item_features(user_id):
    """Return the category and sub-category columns of the user's df_test rows, each shaped (-1, 1)."""
    user_rows = df_test[df_test["user_id"] == user_id]
    categories = user_rows["category"].values.reshape(-1,1)
    sub_categories = user_rows["sub_category"].values.reshape(-1,1)
    return categories, sub_categories

def get_item_features_one_item(article_id):
    """Return (category, sub_category) of the first df_test row with this article id, as 0-d arrays."""
    article_rows = df_test[df_test["article_id"] == article_id]
    category = np.array(article_rows["category"].values[0])
    sub_category = np.array(article_rows["sub_category"].values[0])
    return category, sub_category

In [None]:
def get_article_category(article_id, df):
    """
    Return the "category" value of the first row of df with this article id.
    """
    article_rows = df[df["article_id"] == article_id]
    return article_rows["category"].values[0]
def get_article_subcategory(article_id, df):
    """
    Return the "sub_category" value of the first row of df with this article id.
    """
    article_rows = df[df["article_id"] == article_id]
    return article_rows["sub_category"].values[0]
def get_category_hit_ratio(user_profile, top_ten_categories):
    """Return 1 if any profile entry equals any of the recommended categories, else 0."""
    matched = any(
        profile == category
        for profile in user_profile
        for category in top_ten_categories
    )
    return 1 if matched else 0
def get_ndcgs_category(user_profile, top_ten_categories):
    """Return log(2)/log(rank+2) for the first 0-based rank whose category is in the profile, else 0."""
    for rank, item in enumerate(top_ten_categories):
        if any(item == profile for profile in user_profile):
            return math.log(2) / math.log(rank + 2)
    return 0

In [None]:
def get_recommendations(user_id, df):
    """Return the ids of the (up to) 10 highest-scored candidate articles of a user.

    Candidates are the rows of ``df`` belonging to ``user_id``. The category
    and sub-category features are now read from those same rows; the original
    fetched them from the global df_test, which only matched when ``df`` was
    df_test. The unused tiled ``user_ids`` array was removed.

    Reads the module-level ``model`` and ``userid_to_article_history``.
    """
    ## Setup ###
    user_rows = df[df["user_id"] == user_id]
    display_items = user_rows["article_id"].values.reshape(-1, 1).astype("int64")
    n_items = display_items.shape[0]

    # Repeat the per-user features once per candidate row.
    user_profile = np.tile(get_user_profile(df, user_id), n_items).reshape(-1, 6).astype("int64")
    click_history = np.tile(userid_to_article_history[user_id], n_items).reshape(-1, 30).astype("int64")
    category = user_rows["category"].values.reshape(-1, 1).astype("int64")
    sub_category = user_rows["sub_category"].values.reshape(-1, 1).astype("int64")

    ## Preds ###
    predictions = model.predict([click_history, user_profile, display_items, category, sub_category])
    predicted_labels = np.squeeze(predictions)
    best_first = np.argsort(predicted_labels)[::-1][0:10].tolist()
    top_ten_items = [display_items[i][0] for i in best_first]
    return top_ten_items



In [None]:
def predict_all_users(df):
    """Category-level ranking metrics averaged over all users of ``df``.

    For each user the sub-categories of the recommended articles are compared
    with the user's profile. The profile is now read from ``df`` (the original
    read it from the global df_test regardless of the argument) and the unused
    counter was removed.

    Return: (mean hit@10, mean ndcg@10, mean hit@5, mean ndcg@5)
    """
    hits_ten,ndcgs_ten = [], []
    hits_five, ndcgs_five = [], []

    for user_id in tqdm(df["user_id"].unique()):
        top_ten_articles = get_recommendations(user_id, df)
        top_ten_subcategories = [get_article_subcategory(_id, df) for _id in top_ten_articles]
        top_five_subcategories = top_ten_subcategories[:5]
        user_profile = get_user_profile(df, user_id)

        hits_ten.append(get_category_hit_ratio(user_profile, top_ten_subcategories))
        ndcgs_ten.append(get_ndcgs_category(user_profile, top_ten_subcategories))
        hits_five.append(get_category_hit_ratio(user_profile, top_five_subcategories))
        ndcgs_five.append(get_ndcgs_category(user_profile, top_five_subcategories))
    return np.average(hits_ten), np.average(ndcgs_ten), np.average(hits_five), np.average(ndcgs_five)


category_hits_ten, category_ndcg_ten,category_hits_five,category_ndcg_five   = predict_all_users(df_test)

In [None]:
# Category-level metrics, in order: hit@10, ndcg@10, hit@5, ndcg@5.
print(category_hits_ten)
print(category_ndcg_ten)
print(category_hits_five)
print(category_ndcg_five)

In [None]:
# Append the category-level metrics of the final model to category_performance.txt.
write_category_results("main", category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five)

## 4.2 Architecture 1

In [None]:
def evaluate_one_rating_arc1(model, user_id, all_articles, true_item):
    """Score one user's candidates with the (user, item) model; return (HR@10, NDCG@10, HR@5, NDCG@5).

    The user id is repeated once per candidate. The original hard-coded 100
    repetitions, which only worked for exactly 100 candidates; the count now
    follows the length of ``all_articles``.
    """
    ### Reshaping to make it on the right shape ###
    all_articles = np.array(all_articles).reshape(-1,1)
    expanded_user_id = np.full((all_articles.shape[0], 1), user_id)

    # predictions
    predictions = model.predict([expanded_user_id, all_articles])
    predicted_labels = np.squeeze(predictions)
    # Candidates ordered by predicted score, highest first.
    top_ten_items = [all_articles[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    hr = getHitRatio(top_ten_items, true_item)
    ndcg = getNDCG(top_ten_items, true_item)
    hr_five = getHitRatio(top_ten_items[:5], true_item)
    ndcg_five = getNDCG(top_ten_items[:5], true_item)
    return hr, ndcg, hr_five, ndcg_five

def evalaute_model_arc1(model, df_test, userid_to_true_item):
    """Evaluate the (user, item) model per test user; return the lists (HR@10, NDCG@10, HR@5, NDCG@5)."""
    print("Evaluate model")
    hits, ndcgs = [], []
    hits_five, ndcgs_five = [], []
    for user_id in tqdm(df_test["user_id"].unique()):
        candidate_rows = df_test[df_test["user_id"] == user_id]  # this user's candidate rows
        true_item = userid_to_true_item[user_id]                 # the held-out article
        all_articles = candidate_rows["article_id"].values.astype("int64")

        scores = evaluate_one_rating_arc1(model, user_id, all_articles, true_item)
        for bucket, value in zip((hits, ndcgs, hits_five, ndcgs_five), scores):
            bucket.append(value)
    return hits, ndcgs, hits_five, ndcgs_five

In [None]:
# Params
# NOTE: these reassign num_users / num_items / dims, which the final-model cells also use.
dims = 20
num_users = merged["user_id"].nunique()
num_items = merged["article_id"].nunique()

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """Build and compile the (user, item) dot-product baseline model.

    A user id and an item id are each embedded into ``dims`` dimensions; the
    dot product of the two vectors is passed through a single sigmoid unit.
    dense_layers is accepted but not used in the body.

    NOTE(review): this reuses the name get_model from the final-model cell
    with a different signature, so the earlier definition is replaced once
    this cell has run.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")
    
    user_emb = Embedding(output_dim=dims, input_dim=num_users, input_length=1, name="mf_user_emb")(user_input)
    item_emb = Embedding(output_dim=dims, input_dim=num_items, input_length=1, name="mf_item_emb")(item_input)
    
    # Drop the length-1 sequence axis so each embedding is a flat vector.
    user_vecs = Reshape([dims])(user_emb)
    item_vecs = Reshape([dims])(item_emb)
    
    y = Dot(1, normalize=False)([user_vecs, item_vecs])
    
    y = Dense(1, activation="sigmoid")(y)
    
    
    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

model_arc1 = get_model(num_users, num_items, dims)

In [None]:
# Show a sample instead of the whole training frame.
df_train.head()

In [None]:
###### Training ########
# Positional selection from df_train: column 0 is user_id, 7 is article_id, 10 is labels.
# NOTE: item_input and labels reassign the arrays prepared for the final model.
user_input = df_train.iloc[:, 0].values.reshape((-1,1))
item_input = df_train.iloc[:, 7].values.reshape((-1,1))
labels = df_train.iloc[:, 10].values.reshape((-1,1))
print(user_input.shape, item_input.shape, labels.shape )

In [None]:
all_user_ids = df_train["user_id"].unique()

# Learning curves: one entry per epoch, each the value list Keras reports for that fit() call.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Ranking metrics per epoch, plus the metrics of the epoch with the best Hit@10.
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
# The @5 metrics are reset as well. Before, they kept the values from the
# previous model's run whenever no epoch here improved on best_hits.
best_hits_five = 0
best_ndcgs_five = 0

epochs=2
for epoch in range(epochs):
    # Train one epoch at a time so the ranking metrics can be computed in between.
    hist = model_arc1.fit([user_input, item_input], labels, epochs=1, shuffle=True, verbose=1, validation_split=0.1)

    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model_arc1( model_arc1, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    if (temp_hits > best_hits):
        best_hits = temp_hits
        best_ndcgs = temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)
    
        
    
    

In [None]:
# Best-epoch Hit@10, NDCG@10, Hit@5 and NDCG@5 for architecture 1, one per line.
print(best_hits, best_ndcgs, best_hits_five, best_ndcgs_five, sep="\n")

In [None]:
# Hand the best-epoch Hit@10 / NDCG@10 / Hit@5 / NDCG@5 values to write_accuracy_results under the key "arc1".
write_accuracy_results("arc1", best_hits, best_ndcgs, best_hits_five, best_ndcgs_five)

In [None]:
import matplotlib.pyplot as plt

# Train vs. validation accuracy per epoch for architecture 1, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_acc)
ax.plot(val_acc)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc1_accuracy.pdf")
plt.show()


In [None]:
# Train vs. validation loss per epoch for architecture 1, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc1_loss.pdf")
plt.show()

In [None]:
# Per-epoch Hit@10 on the test set against the training loss (architecture 1).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(hits_list)
ax.plot(train_loss)
ax.set_title('Hit ratio vs Loss')
ax.set_xlabel('epoch')
ax.legend(['Hit@10', 'Train loss'], loc='upper left')
fig.savefig("arc1_hit_loss.pdf")
plt.show()

In [None]:
def get_recommendations_arc1(user_id, df, model):
    """Rank one user's candidate articles with the arc-1 model.

    Args:
        user_id: value matched against ``df["user_id"]``; must be castable to int64.
        df: frame with ``user_id`` and ``article_id`` columns; every row of the
            user is treated as a candidate.
        model: object whose ``predict([users, items])`` returns one score per row.

    Returns:
        Up to ten article ids (int64), highest predicted score first.
    """
    is_user_row = df["user_id"] == user_id
    candidates = df.loc[is_user_row, "article_id"].values.astype("int64").reshape(-1, 1)

    # One copy of the user id per candidate, as a column aligned with `candidates`.
    repeated_user = np.full((candidates.shape[0], 1), user_id).astype("int64")

    scores = np.squeeze(model.predict([repeated_user, candidates]))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    return [candidates[pos][0] for pos in best_positions]

In [None]:
def get_category_hits_ndcg_arc1(df, model):
    """Average category-level hit ratio and NDCG of arc-1 recommendations.

    For every distinct user in ``df`` the top-ten recommended articles are mapped
    to subcategories and scored against ``userid_to_profile[user_id]`` with the
    notebook's ``get_category_hit_ratio`` / ``get_ndcgs_category`` helpers, once on
    all ten and once on the first five.

    Returns:
        ``(hit@10, ndcg@10, hit@5, ndcg@5)``, each averaged over users.
    """
    # Parallel per-user score lists: hit@10, ndcg@10, hit@5, ndcg@5.
    score_columns = ([], [], [], [])

    for user_id in tqdm(df["user_id"].unique()):
        recommended = get_recommendations_arc1(user_id, df, model)
        subcategories = [get_article_subcategory(article, df) for article in recommended]
        profile = userid_to_profile[user_id]

        user_scores = (
            get_category_hit_ratio(profile, subcategories),
            get_ndcgs_category(profile, subcategories),
            get_category_hit_ratio(profile, subcategories[:5]),
            get_ndcgs_category(profile, subcategories[:5]),
        )
        for column, score in zip(score_columns, user_scores):
            column.append(score)

    return tuple(np.average(column) for column in score_columns)


category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five = get_category_hits_ndcg_arc1(df_test, model_arc1)

In [None]:
# Category-level hit@10, ndcg@10, hit@5 and ndcg@5 for architecture 1, one per line.
print(category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five, sep="\n")

In [None]:
# Hand the four category-level averages to write_category_results under the key "arc1".
write_category_results("arc1", category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five)

# 4.3 Architecture 2

In [None]:
def evaluate_one_rating_arc2(model, user_id, user_profiles, all_articles, user_clicks, true_item):
    """Score one user's candidates with the arc-2 model and compute ranking metrics.

    Args:
        model: model whose ``predict`` takes ``[clicks, profiles, articles]``.
        user_id: not read by this function; kept for call-site symmetry.
        user_profiles: profile features, one row per candidate.
        all_articles: candidate article ids; reshaped here into one column.
        user_clicks: click-history features, one row per candidate.
        true_item: the held-out item passed to ``getHitRatio`` / ``getNDCG``.

    Returns:
        ``(hr@10, ndcg@10, hr@5, ndcg@5)`` as returned by the notebook's
        ``getHitRatio`` and ``getNDCG`` helpers.
    """
    candidates = np.array(all_articles).reshape(-1, 1)

    scores = np.squeeze(model.predict([user_clicks, user_profiles, candidates]))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    # Each ranked entry is a length-1 row of `candidates`, not a bare scalar.
    ranked = [candidates[pos] for pos in best_positions]

    hit_ten = getHitRatio(ranked, true_item)
    ndcg_ten = getNDCG(ranked, true_item)
    hit_five = getHitRatio(ranked[:5], true_item)
    ndcg_five = getNDCG(ranked[:5], true_item)
    return hit_ten, ndcg_ten, hit_five, ndcg_five

def evalaute_model_arc2(model, df_test, userid_to_true_item):
    """Run the arc-2 ranking evaluation for every user in ``df_test``.

    Args:
        model: arc-2 model forwarded to ``evaluate_one_rating_arc2``.
        df_test: test frame; features are taken by column position
            (1:7 profile, 10: click history) plus the ``article_id`` column.
        userid_to_true_item: mapping from user id to that user's held-out item.

    Returns:
        Four parallel lists ``(hr@10, ndcg@10, hr@5, ndcg@5)``, one entry per user.
    """
    print("Evaluate model")
    hits, ndcgs = [], []
    hits_five, ndcgs_five = [], []

    for user_id in tqdm(df_test["user_id"].unique()):
        user_rows = df_test.loc[df_test["user_id"] == user_id]  # this user's candidate rows
        target = userid_to_true_item[user_id]
        candidates = user_rows["article_id"].values.astype("int64")
        profiles = user_rows.iloc[:, 1:7].values.astype("int64")
        # NOTE(review): training slices click history from column 11 of df_train;
        # confirm df_test's layout really places the history at column 10 onward.
        clicks = user_rows.iloc[:, 10:].values.astype("int64")

        hit_ten, ndcg_ten, hit_five, ndcg_five = evaluate_one_rating_arc2(
            model, user_id, profiles, candidates, clicks, target
        )
        hits.append(hit_ten)
        ndcgs.append(ndcg_ten)
        hits_five.append(hit_five)
        ndcgs_five.append(ndcg_five)

    return hits, ndcgs, hits_five, ndcgs_five

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model_arc2(num_users, num_items, dims, dense_layers=[128, 64, 32, 8]):
    """Build and compile architecture 2: flattened embeddings into a small MLP.

    Args:
        num_users: not read by this function.
        num_items: number of items; both item-id embedding tables use ``num_items + 1`` rows.
        dims: width of every embedding vector.
        dense_layers: accepted for signature compatibility; not read by this function.

    The profile embedding's vocabulary size comes from the notebook-level
    ``num_unique_categories``.

    Returns:
        A compiled Keras ``Model`` taking ``[history (30), profile (6), item (1)]``
        and emitting one sigmoid output per row.
    """
    history_in = Input(shape=(30,), name="user")
    profile_in = Input(shape=(6,), name="profile")
    item_in = Input(shape=(1,), name="item")

    history_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=30, name="mf_user_emb"
    )(history_in)
    profile_emb = Embedding(
        input_dim=num_unique_categories, output_dim=dims, input_length=6, name="mf_profile_emb"
    )(profile_in)
    item_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=1, name="mf_item_emb"
    )(item_in)

    history_flat = Flatten()(history_emb)
    profile_flat = Flatten()(profile_emb)
    item_flat = Reshape([dims])(item_emb)

    # Feature order seen by the dense layer: profile, click history, item.
    user_side = Concatenate(axis=1)([profile_flat, history_flat])
    features = Concatenate()([user_side, item_flat])

    hidden = Dense(128, activation="relu", name="dense_0")(features)
    hidden = Dropout(0.5)(hidden)
    click_probability = Dense(1, activation="sigmoid", name="prediction")(hidden)

    model = Model(inputs=[history_in, profile_in, item_in], outputs=click_probability)
    model.compile(optimizer=Adam(0.01), loss="binary_crossentropy", metrics=["accuracy"])
    return model


model_arc2 = get_model_arc2(num_users, num_items, dims)

In [None]:
###### Training ########
# Architecture-2 inputs, sliced from df_train by column position and cast to int64:
# 11: -> click history, 1:7 -> profile, 7 -> item, 10 -> label.
user_history = df_train.iloc[:, 11:].values.astype("int64")
profile_input = df_train.iloc[:, 1:7].values.astype("int64")
item_input = df_train.iloc[:, 7].values.astype("int64")[:, np.newaxis]
labels = df_train.iloc[:, 10].values.astype("int64")[:, np.newaxis]
print(user_history.shape, profile_input.shape, item_input.shape, labels.shape)

In [None]:
all_user_ids = merged["user_id"].unique()

epochs = 2

# Learning curves and per-epoch test-set ranking metrics.
train_loss, val_loss = [], []
train_acc, val_acc = [], []
hits_list, ndcg_list = [], []

# Metrics of the best epoch so far (selected by Hit@10).
best_hits = best_ndcgs = 0
best_hits_five = best_ndcgs_five = 0

for epoch in range(epochs):
    # One epoch per fit() call so the ranking evaluation can run in between.
    hist = model_arc2.fit(
        [user_history, profile_input, item_input],
        labels,
        epochs=1,
        shuffle=True,
        verbose=1,
        validation_split=0.1,
    )

    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model_arc2(model_arc2, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    if temp_hits > best_hits:
        best_hits, best_ndcgs = temp_hits, temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)
        

In [None]:
# Best-epoch ranking metrics for architecture 2 (metric label spelled "NDCG").
print("Hit @ 10: {:.2f}".format(best_hits))
print("NDCG @ 10: {:.2f}".format(best_ndcgs))
print("Hit @ 5: {:.2f}".format(best_hits_five))
print("NDCG @ 5: {:.2f}".format(best_ndcgs_five))

In [None]:
# Hand the best-epoch Hit@10 / NDCG@10 / Hit@5 / NDCG@5 values to write_accuracy_results under the key "arc2".
write_accuracy_results("arc2", best_hits, best_ndcgs, best_hits_five, best_ndcgs_five)

In [None]:
# Train vs. validation accuracy per epoch for architecture 2, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_acc)
ax.plot(val_acc)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc2_accuracy.pdf")
plt.show()


In [None]:
# Train vs. validation loss per epoch for architecture 2, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc2_loss.pdf")
plt.show()

In [None]:
# Per-epoch Hit@10 on the test set against the training loss (architecture 2).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(hits_list)
ax.plot(train_loss)
ax.set_title('Hit ratio vs Loss')
ax.set_xlabel('epoch')
ax.legend(['Hit@10', 'Train loss'], loc='upper left')
fig.savefig("arc2_hit_loss.pdf")
plt.show()

In [None]:
def get_recommendations_arc2(user_id, df, model):
    """Rank one user's candidate articles with the arc-2 model.

    The user's click history (``userid_to_article_history``) and profile
    (``get_user_profile``) are repeated once per candidate row so that all three
    model inputs line up.

    Args:
        user_id: user whose rows in ``df`` supply the candidate articles.
        df: frame with ``user_id`` and ``article_id`` columns.
        model: model whose ``predict`` takes ``[history, profile, item]``.

    Returns:
        Up to ten article ids (int64), highest predicted score first.
    """
    history = userid_to_article_history[user_id]
    profile = get_user_profile(df, user_id)
    candidates = df.loc[df["user_id"] == user_id, "article_id"].values.astype("int64").reshape(-1, 1)
    n_candidates = candidates.shape[0]

    # Rows of width 6 (profile) and 30 (history), matching the model's input shapes.
    profile_rows = np.tile(profile, n_candidates).reshape(-1, 6).astype("int64")
    history_rows = np.tile(np.array(history), n_candidates).reshape(-1, 30).astype("int64")

    scores = np.squeeze(model.predict([history_rows, profile_rows, candidates]))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    return [candidates[pos][0] for pos in best_positions]

In [None]:
def get_category_hits_ndcg_arc2(df, model):
    """Average category-level hit ratio and NDCG of arc-2 recommendations.

    For every distinct user in ``df`` the top-ten recommended articles are mapped
    to subcategories and scored against ``userid_to_profile[user_id]`` with the
    notebook's ``get_category_hit_ratio`` / ``get_ndcgs_category`` helpers, once on
    all ten and once on the first five.

    Returns:
        ``(hit@10, ndcg@10, hit@5, ndcg@5)``, each averaged over users.
    """
    # Parallel per-user score lists: hit@10, ndcg@10, hit@5, ndcg@5.
    score_columns = ([], [], [], [])

    for user_id in tqdm(df["user_id"].unique()):
        recommended = get_recommendations_arc2(user_id, df, model)
        subcategories = [get_article_subcategory(article, df) for article in recommended]
        profile = userid_to_profile[user_id]

        user_scores = (
            get_category_hit_ratio(profile, subcategories),
            get_ndcgs_category(profile, subcategories),
            get_category_hit_ratio(profile, subcategories[:5]),
            get_ndcgs_category(profile, subcategories[:5]),
        )
        for column, score in zip(score_columns, user_scores):
            column.append(score)

    return tuple(np.average(column) for column in score_columns)


category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five = get_category_hits_ndcg_arc2(df_test, model_arc2)

In [None]:
# Category-level hit@10, ndcg@10, hit@5 and ndcg@5 for architecture 2, one per line.
print(category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five, sep="\n")

In [None]:
# Hand the four category-level averages to write_category_results under the key "arc2".
write_category_results("arc2", category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five)

# 4.4 Architecture 3

In [None]:
def evaluate_one_rating_arc3(model, user_id, user_profiles, all_articles, user_clicks, true_item, categories, sub_categories):
    """Score one user's candidates with the arc-3 model and compute ranking metrics.

    Args:
        model: model whose ``predict`` takes
            ``[clicks, profiles, articles, categories, sub_categories]``.
        user_id: not read by this function; kept for call-site symmetry.
        user_profiles: profile features, one row per candidate.
        all_articles: candidate article ids; reshaped here into one column.
        user_clicks: click-history features, one row per candidate.
        true_item: the held-out item passed to ``getHitRatio`` / ``getNDCG``.
        categories: category id of each candidate.
        sub_categories: subcategory id of each candidate.

    Returns:
        ``(hr@10, ndcg@10, hr@5, ndcg@5)`` as returned by the notebook's
        ``getHitRatio`` and ``getNDCG`` helpers.
    """
    candidates = np.array(all_articles).reshape(-1, 1)

    model_inputs = [user_clicks, user_profiles, candidates, categories, sub_categories]
    scores = np.squeeze(model.predict(model_inputs))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    # Each ranked entry is a length-1 row of `candidates`, not a bare scalar.
    ranked = [candidates[pos] for pos in best_positions]

    hit_ten = getHitRatio(ranked, true_item)
    ndcg_ten = getNDCG(ranked, true_item)
    hit_five = getHitRatio(ranked[:5], true_item)
    ndcg_five = getNDCG(ranked[:5], true_item)
    return hit_ten, ndcg_ten, hit_five, ndcg_five

def evalaute_model_arc3(model, df_test, userid_to_true_item):
    """Run the arc-3 ranking evaluation for every user in ``df_test``.

    Args:
        model: arc-3 model forwarded to ``evaluate_one_rating_arc3``.
        df_test: test frame; features are taken by column position
            (1:7 profile, 8 category, 9 subcategory, 10: click history) plus the
            ``article_id`` column.
        userid_to_true_item: mapping from user id to that user's held-out item.

    Returns:
        Four parallel lists ``(hr@10, ndcg@10, hr@5, ndcg@5)``, one entry per user.
    """
    print("Evaluate model")
    hits, ndcgs = [], []
    hits_five, ndcgs_five = [], []

    for user_id in tqdm(df_test["user_id"].unique()):
        user_rows = df_test.loc[df_test["user_id"] == user_id]  # this user's candidate rows
        target = userid_to_true_item[user_id]
        candidates = user_rows["article_id"].values.astype("int64")
        profiles = user_rows.iloc[:, 1:7].values.astype("int64")
        # NOTE(review): training slices click history from column 11 of df_train;
        # confirm df_test's layout really places the history at column 10 onward.
        clicks = user_rows.iloc[:, 10:].values.astype("int64")
        item_categories = user_rows.iloc[:, 8].values.astype("int64")
        item_subcategories = user_rows.iloc[:, 9].values.astype("int64")

        hit_ten, ndcg_ten, hit_five, ndcg_five = evaluate_one_rating_arc3(
            model,
            user_id,
            profiles,
            candidates,
            clicks,
            target,
            item_categories,
            item_subcategories,
        )
        hits.append(hit_ten)
        ndcgs.append(ndcg_ten)
        hits_five.append(hit_five)
        ndcgs_five.append(ndcg_five)

    return hits, ndcgs, hits_five, ndcgs_five

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model_arc3(num_users, num_items, dims, num_categories, num_sub_categories, dense_layers=[128, 64, 32, 8]):
    """Build and compile architecture 3: an LSTM history tower plus a dense item tower.

    Args:
        num_users: not read by this function.
        num_items: number of items; both item-id embedding tables use ``num_items + 1`` rows.
        dims: width of every embedding vector.
        num_categories: vocabulary size of the category embedding.
        num_sub_categories: vocabulary size of the subcategory embedding.
        dense_layers: accepted for signature compatibility; not read by this function.

    The profile embedding's vocabulary size comes from the notebook-level
    ``num_unique_categories``.

    Returns:
        A compiled Keras ``Model`` taking
        ``[history (30), profile (6), item (1), category (1), subcategory (1)]``
        and emitting one sigmoid output per row.
    """
    # Inputs: two user-side, three item-side.
    history_in = Input(shape=(30,), name="user")
    profile_in = Input(shape=(6,), name="profile")
    item_in = Input(shape=(1,), name="item")
    category_in = Input(shape=(1,), name="category")
    subcategory_in = Input(shape=(1,), name="subcategory")

    # Embedding lookups.
    history_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=30, name="mf_user_emb"
    )(history_in)
    profile_emb = Embedding(
        input_dim=num_unique_categories, output_dim=dims, input_length=6, name="mf_profile_emb"
    )(profile_in)
    item_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=1, name="mf_item_emb"
    )(item_in)
    category_emb = Embedding(
        input_dim=num_categories, output_dim=dims, input_length=1, name="cat_emb"
    )(category_in)
    subcategory_emb = Embedding(
        input_dim=num_sub_categories, output_dim=dims, input_length=1, name="subcat_emb"
    )(subcategory_in)

    # The profile embedding is stacked with the item-side embeddings along axis 1.
    stacked_item_side = Concatenate(axis=1)([item_emb, category_emb, subcategory_emb, profile_emb])

    # History tower: LSTM over the click sequence.
    history_tower = LSTM(40)(history_emb)
    history_tower = Dropout(0.5)(history_tower)
    history_tower = BatchNormalization(axis=1)(history_tower)

    # Item tower: flatten the stacked embeddings into one linear dense layer.
    item_tower = Flatten()(stacked_item_side)
    item_tower = Dense(128)(item_tower)
    item_tower = Dropout(0.5)(item_tower)
    item_tower = BatchNormalization(axis=1)(item_tower)

    # Click predictor on the joined towers.
    joined = Concatenate()([history_tower, item_tower])
    click_probability = Dense(1, activation="sigmoid")(joined)

    model = Model(
        inputs=[history_in, profile_in, item_in, category_in, subcategory_in],
        outputs=click_probability,
    )
    model.compile(optimizer=Adam(0.01), loss="binary_crossentropy", metrics=["accuracy"])
    return model


model_arc3 = get_model_arc3(num_users, num_items, dims, num_categories, num_sub_categories)

In [None]:
###### Training ########
# Architecture-3 inputs, sliced from df_train by column position and cast to int64:
# 11: -> click history, 1:7 -> profile, 7 -> item, 10 -> label,
# 8 -> category, 9 -> subcategory.
user_history = df_train.iloc[:, 11:].values.astype("int64")
profile_input = df_train.iloc[:, 1:7].values.astype("int64")
item_input = df_train.iloc[:, 7].values.astype("int64")[:, np.newaxis]
labels = df_train.iloc[:, 10].values.astype("int64")[:, np.newaxis]
category_input = df_train.iloc[:, 8].values.astype("int64")[:, np.newaxis]
subcategory_input = df_train.iloc[:, 9].values.astype("int64")[:, np.newaxis]
print(user_history.shape, profile_input.shape, item_input.shape, labels.shape)

In [None]:
all_user_ids = merged["user_id"].unique()

# Learning curves and per-epoch test-set ranking metrics.
train_loss, val_loss = [], []
train_acc, val_acc = [], []
hits_list, ndcg_list = [], []

# Metrics of the best epoch so far (selected by Hit@10).
best_hits = best_ndcgs = 0
best_hits_five = best_ndcgs_five = 0

epochs = 2
for epoch in range(epochs):
    # One epoch per fit() call so the ranking evaluation can run in between.
    hist = model_arc3.fit(
        [user_history, profile_input, item_input, category_input, subcategory_input],
        labels,
        validation_split=0.1,
        epochs=1,
        shuffle=True,
        verbose=1,
    )

    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model_arc3(model_arc3, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    if temp_hits > best_hits:
        best_hits, best_ndcgs = temp_hits, temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)

In [None]:
# Best-epoch ranking metrics for architecture 3 (metric label spelled "NDCG").
print("Hit @ 10: {:.2f}".format(best_hits))
print("NDCG @ 10: {:.2f}".format(best_ndcgs))
print("Hit @ 5: {:.2f}".format(best_hits_five))
print("NDCG @ 5: {:.2f}".format(best_ndcgs_five))

In [None]:
# Hand the best-epoch Hit@10 / NDCG@10 / Hit@5 / NDCG@5 values to write_accuracy_results under the key "arc3".
write_accuracy_results("arc3", best_hits, best_ndcgs, best_hits_five, best_ndcgs_five)

In [None]:
# Train vs. validation accuracy per epoch for architecture 3, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_acc)
ax.plot(val_acc)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc3_accuracy.pdf")
plt.show()


In [None]:
# Train vs. validation loss per epoch for architecture 3, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc3_loss.pdf")
plt.show()

In [None]:
# Per-epoch Hit@10 on the test set against the training loss (architecture 3).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(hits_list)
ax.plot(train_loss)
ax.set_title('Hit ratio vs Loss')
ax.set_xlabel('epoch')
ax.legend(['Hit@10', 'Train loss'], loc='upper left')
fig.savefig("arc3_hit_loss.pdf")
plt.show()

In [None]:
def get_recommendations_arc3(user_id, df, model):
    """Rank one user's candidate articles with the arc-3 model.

    The user's click history (``userid_to_article_history``) and profile
    (``get_user_profile``) are repeated once per candidate row; category and
    subcategory ids come from ``get_item_features(user_id)`` and are passed on
    as returned (presumably one entry per candidate -- verify against that helper).

    Args:
        user_id: user whose rows in ``df`` supply the candidate articles.
        df: frame with ``user_id`` and ``article_id`` columns.
        model: model whose ``predict`` takes
            ``[history, profile, item, category, subcategory]``.

    Returns:
        Up to ten article ids (int64), highest predicted score first.
    """
    history = userid_to_article_history[user_id]
    profile = get_user_profile(df, user_id)
    candidates = df.loc[df["user_id"] == user_id, "article_id"].values.astype("int64").reshape(-1, 1)
    n_candidates = candidates.shape[0]

    # Rows of width 6 (profile) and 30 (history), matching the model's input shapes.
    profile_rows = np.tile(profile, n_candidates).reshape(-1, 6).astype("int64")
    item_categories, item_subcategories = get_item_features(user_id)
    history_rows = np.tile(np.array(history), n_candidates).reshape(-1, 30).astype("int64")
    item_categories = np.asarray(item_categories).astype("int64")
    item_subcategories = np.asarray(item_subcategories).astype("int64")

    model_inputs = [history_rows, profile_rows, candidates, item_categories, item_subcategories]
    scores = np.squeeze(model.predict(model_inputs))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    return [candidates[pos][0] for pos in best_positions]

def predict_all_users_arc3(df, model):
    """Average category-level hit ratio and NDCG of arc-3 recommendations.

    For every distinct user in ``df`` the top-ten recommended articles are mapped
    to subcategories and scored against the user's profile with the notebook's
    ``get_category_hit_ratio`` / ``get_ndcgs_category`` helpers, once on all ten
    and once on the first five.

    Args:
        df: frame supplying users, candidates and profiles.
        model: arc-3 model forwarded to ``get_recommendations_arc3``.

    Returns:
        ``(hit@10, ndcg@10, hit@5, ndcg@5)``, each averaged over users.

    Raises:
        AssertionError: if a user yields fewer than ten recommendations.
    """
    hits_ten, ndcgs_ten = [], []
    hits_five, ndcgs_five = [], []
    for user_id in tqdm(df["user_id"].unique()):
        top_ten_articles = get_recommendations_arc3(user_id, df, model)
        assert len(top_ten_articles) == 10
        top_ten_subcategories = [get_article_subcategory(_id, df) for _id in top_ten_articles]
        # Read the profile from the frame being evaluated rather than from the
        # notebook-level df_test, so the function honours its `df` argument.
        user_profile = get_user_profile(df, user_id)

        hits_ten.append(get_category_hit_ratio(user_profile, top_ten_subcategories))
        ndcgs_ten.append(get_ndcgs_category(user_profile, top_ten_subcategories))
        hits_five.append(get_category_hit_ratio(user_profile, top_ten_subcategories[:5]))
        ndcgs_five.append(get_ndcgs_category(user_profile, top_ten_subcategories[:5]))
    return np.average(hits_ten), np.average(ndcgs_ten), np.average(hits_five), np.average(ndcgs_five)


category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five = predict_all_users_arc3(df_test, model_arc3)


In [None]:
# Category-level hit@10, ndcg@10, hit@5 and ndcg@5 for architecture 3, one per line.
print(category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five, sep="\n")

In [None]:
# Hand the four category-level averages to write_category_results under the key "arc3".
write_category_results("arc3", category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five)

# 4.5 Architecture 4

In [None]:
def evaluate_one_rating_arc4(model, user_id, user_profiles, all_articles, user_clicks, true_item, categories, sub_categories):
    """Score one user's candidates with the arc-4 model and compute ranking metrics.

    Args:
        model: model whose ``predict`` takes
            ``[clicks, profiles, articles, categories, sub_categories]``.
        user_id: not read by this function; kept for call-site symmetry.
        user_profiles: profile features, one row per candidate.
        all_articles: candidate article ids; reshaped here into one column.
        user_clicks: click-history features, one row per candidate.
        true_item: the held-out item passed to ``getHitRatio`` / ``getNDCG``.
        categories: category id of each candidate.
        sub_categories: subcategory id of each candidate.

    Returns:
        ``(hr@10, ndcg@10, hr@5, ndcg@5)`` as returned by the notebook's
        ``getHitRatio`` and ``getNDCG`` helpers.
    """
    candidates = np.array(all_articles).reshape(-1, 1)

    model_inputs = [user_clicks, user_profiles, candidates, categories, sub_categories]
    scores = np.squeeze(model.predict(model_inputs))
    best_positions = np.argsort(scores)[::-1][:10].tolist()
    # Each ranked entry is a length-1 row of `candidates`, not a bare scalar.
    ranked = [candidates[pos] for pos in best_positions]

    hit_ten = getHitRatio(ranked, true_item)
    ndcg_ten = getNDCG(ranked, true_item)
    hit_five = getHitRatio(ranked[:5], true_item)
    ndcg_five = getNDCG(ranked[:5], true_item)
    return hit_ten, ndcg_ten, hit_five, ndcg_five

def evalaute_model_arc4(model, df_test, userid_to_true_item):
    """Run the arc-4 ranking evaluation for every user in ``df_test``.

    Args:
        model: arc-4 model forwarded to ``evaluate_one_rating_arc4``.
        df_test: test frame; features are taken by column position
            (1:7 profile, 8 category, 9 subcategory, 10: click history) plus the
            ``article_id`` column.
        userid_to_true_item: mapping from user id to that user's held-out item.

    Returns:
        Four parallel lists ``(hr@10, ndcg@10, hr@5, ndcg@5)``, one entry per user.
    """
    print("Evaluate model")
    hits, ndcgs = [], []
    hits_five, ndcgs_five = [], []

    for user_id in tqdm(df_test["user_id"].unique()):
        user_rows = df_test.loc[df_test["user_id"] == user_id]  # this user's candidate rows
        target = userid_to_true_item[user_id]
        candidates = user_rows["article_id"].values.astype("int64")
        profiles = user_rows.iloc[:, 1:7].values.astype("int64")
        # NOTE(review): training slices click history from column 11 of df_train;
        # confirm df_test's layout really places the history at column 10 onward.
        clicks = user_rows.iloc[:, 10:].values.astype("int64")
        item_categories = user_rows.iloc[:, 8].values.astype("int64")
        item_subcategories = user_rows.iloc[:, 9].values.astype("int64")

        hit_ten, ndcg_ten, hit_five, ndcg_five = evaluate_one_rating_arc4(
            model,
            user_id,
            profiles,
            candidates,
            clicks,
            target,
            item_categories,
            item_subcategories,
        )
        hits.append(hit_ten)
        ndcgs.append(ndcg_ten)
        hits_five.append(hit_five)
        ndcgs_five.append(ndcg_five)

    return hits, ndcgs, hits_five, ndcgs_five

In [None]:
#@tf.autograph.experimental.do_not_convert
def get_model_arc4(num_users, num_items, dims, num_categories, num_sub_categories, dense_layers=[128, 64, 32, 8]):
    """Build and compile architecture 4: LSTM over history+item, MLP over category features.

    Args:
        num_users: not read by this function.
        num_items: number of items; both item-id embedding tables use ``num_items + 1`` rows.
        dims: width of every embedding vector.
        num_categories: vocabulary size of the category embedding.
        num_sub_categories: vocabulary size of the subcategory embedding.
        dense_layers: accepted for signature compatibility; not read by this function.

    The profile embedding's vocabulary size comes from the notebook-level
    ``num_unique_categories``.

    Returns:
        A compiled Keras ``Model`` taking
        ``[history (30), profile (6), item (1), category (1), subcategory (1)]``
        and emitting one sigmoid output per row.
    """
    # Inputs: two user-side, three item-side.
    history_in = Input(shape=(30,), name="user")
    profile_in = Input(shape=(6,), name="profile")
    item_in = Input(shape=(1,), name="item")
    category_in = Input(shape=(1,), name="category")
    subcategory_in = Input(shape=(1,), name="subcategory")

    # Embedding lookups.
    history_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=30, name="mf_user_emb"
    )(history_in)
    profile_emb = Embedding(
        input_dim=num_unique_categories, output_dim=dims, input_length=6, name="mf_profile_emb"
    )(profile_in)
    item_emb = Embedding(
        input_dim=num_items+1, output_dim=dims, input_length=1, name="mf_item_emb"
    )(item_in)
    category_emb = Embedding(
        input_dim=num_categories, output_dim=dims, input_length=1, name="cat_emb"
    )(category_in)
    subcategory_emb = Embedding(
        input_dim=num_sub_categories, output_dim=dims, input_length=1, name="subcat_emb"
    )(subcategory_in)

    # The candidate item is appended to the click sequence along axis 1 before the
    # LSTM; profile, category and subcategory embeddings feed the MLP tower.
    sequence_tower = Concatenate(axis=1)([history_emb, item_emb])
    category_tower = Concatenate(axis=1)([profile_emb, category_emb, subcategory_emb])
    category_tower = Flatten()(category_tower)

    # LSTM tower.
    sequence_tower = LSTM(40)(sequence_tower)
    sequence_tower = Dropout(0.8)(sequence_tower)
    sequence_tower = BatchNormalization(axis=1)(sequence_tower)

    # MLP tower: a two-unit linear layer.
    category_tower = Dense(2)(category_tower)
    category_tower = Dropout(0.2)(category_tower)
    category_tower = BatchNormalization(axis=1)(category_tower)

    # Click predictor on the joined, re-normalised towers.
    joined = Concatenate()([sequence_tower, category_tower])
    joined = BatchNormalization(axis=1)(joined)
    click_probability = Dense(1, activation="sigmoid")(joined)

    model = Model(
        inputs=[history_in, profile_in, item_in, category_in, subcategory_in],
        outputs=click_probability,
    )
    model.compile(optimizer=Adam(0.01), loss="binary_crossentropy", metrics=["accuracy"])
    return model


model_arc4 = get_model_arc4(num_users, num_items, dims, num_categories, num_sub_categories)

In [None]:
###### Training ########
# Architecture-4 inputs, sliced from df_train by column position and cast to int64:
# 11: -> click history, 1:7 -> profile, 7 -> item, 10 -> label,
# 8 -> category, 9 -> subcategory.
user_history = df_train.iloc[:, 11:].values.astype("int64")
profile_input = df_train.iloc[:, 1:7].values.astype("int64")
item_input = df_train.iloc[:, 7].values.astype("int64")[:, np.newaxis]
labels = df_train.iloc[:, 10].values.astype("int64")[:, np.newaxis]
category_input = df_train.iloc[:, 8].values.astype("int64")[:, np.newaxis]
subcategory_input = df_train.iloc[:, 9].values.astype("int64")[:, np.newaxis]
print(user_history.shape, profile_input.shape, item_input.shape, labels.shape)

In [None]:
all_user_ids = merged["user_id"].unique()

# Learning curves and per-epoch test-set ranking metrics.
train_loss, val_loss = [], []
train_acc, val_acc = [], []
hits_list, ndcg_list = [], []

# Metrics of the best epoch so far (selected by Hit@10).
best_hits = best_ndcgs = 0
best_hits_five = best_ndcgs_five = 0

epochs = 2
for epoch in range(epochs):
    # One epoch per fit() call so the ranking evaluation can run in between.
    hist = model_arc4.fit(
        [user_history, profile_input, item_input, category_input, subcategory_input],
        labels,
        epochs=1,
        validation_split=0.1,
        shuffle=True,
        verbose=1,
    )

    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    val_loss.append(hist.history["val_loss"])
    val_acc.append(hist.history["val_accuracy"])

    hits, ndcgs, hits_five, ndcgs_five = evalaute_model_arc4(model_arc4, df_test, userid_to_true_item)
    temp_hits = np.average(hits)
    temp_ndcgs = np.average(ndcgs)
    hits_list.append(temp_hits)
    ndcg_list.append(temp_ndcgs)

    if temp_hits > best_hits:
        best_hits, best_ndcgs = temp_hits, temp_ndcgs
        best_hits_five = np.average(hits_five)
        best_ndcgs_five = np.average(ndcgs_five)
    

In [None]:
# Best-epoch ranking metrics for architecture 4 (metric label spelled "NDCG").
print("Hit @ 10: {:.2f}".format(best_hits))
print("NDCG @ 10: {:.2f}".format(best_ndcgs))
print("Hit @ 5: {:.2f}".format(best_hits_five))
print("NDCG @ 5: {:.2f}".format(best_ndcgs_five))

In [None]:
# Hand the best-epoch Hit@10 / NDCG@10 / Hit@5 / NDCG@5 values to write_accuracy_results under the key "arc4".
write_accuracy_results("arc4", best_hits, best_ndcgs, best_hits_five, best_ndcgs_five)

In [None]:
# Train vs. validation accuracy per epoch for architecture 4, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_acc)
ax.plot(val_acc)
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc4_accuracy.pdf")
plt.show()


In [None]:
# Train vs. validation loss per epoch for architecture 4, saved as a PDF.
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(train_loss)
ax.plot(val_loss)
ax.set_title('Loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig("arc4_loss.pdf")
plt.show()

In [None]:
# Per-epoch Hit@10 on the test set against the training loss (architecture 4).
sns.set_style("darkgrid")
fig, ax = plt.subplots()
ax.plot(hits_list)
ax.plot(train_loss)
ax.set_title('Hit ratio vs Loss')
ax.set_xlabel('epoch')
ax.legend(['Hit@10', 'Train loss'], loc='upper left')
fig.savefig("arc4_hit_loss.pdf")
plt.show()

In [None]:
def get_recommendations_arc4(user_id, df, model):
    """Rank one user's candidate articles with the arc-4 model.

    The user's profile (``get_user_profile``) and click history
    (``userid_to_article_history``) are repeated once per candidate row; category
    and subcategory ids come from ``get_item_features(user_id)`` and are passed on
    as returned (presumably one entry per candidate -- verify against that helper).

    Args:
        user_id: user whose rows in ``df`` supply the candidate articles.
        df: frame with ``user_id`` and ``article_id`` columns.
        model: model whose ``predict`` takes
            ``[history, profile, item, category, subcategory]``.

    Returns:
        Up to ten article ids (int64), highest predicted score first.
    """
    ## Setup ###
    user_profile = get_user_profile(df, user_id)
    click_history = userid_to_article_history[user_id]
    display_items = df[df["user_id"] == user_id]["article_id"].values.reshape(-1, 1).astype("int64")
    n_candidates = display_items.shape[0]

    # Rows of width 6 (profile) and 30 (history), matching the model's input shapes.
    user_profile = np.tile(user_profile, n_candidates).reshape(-1, 6).astype("int64")
    category, sub_category = get_item_features(user_id)
    category = np.asarray(category).astype("int64")
    sub_category = np.asarray(sub_category).astype("int64")
    click_history = np.tile(click_history, n_candidates).reshape(-1, 30).astype("int64")

    ## Preds ###
    predictions = model.predict([click_history, user_profile, display_items, category, sub_category])
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [display_items[i][0] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    return top_ten_items

def predict_all_users_arc4(df, model):
    """Average category-level hit ratio and NDCG of arc-4 recommendations.

    For every distinct user in ``df`` the top-ten recommended articles are mapped
    to subcategories and scored against the user's profile with the notebook's
    ``get_category_hit_ratio`` / ``get_ndcgs_category`` helpers, once on all ten
    and once on the first five.

    Args:
        df: frame supplying users, candidates and profiles.
        model: arc-4 model forwarded to ``get_recommendations_arc4``.

    Returns:
        ``(hit@10, ndcg@10, hit@5, ndcg@5)``, each averaged over users.
    """
    hits_ten, ndcgs_ten = [], []
    hits_five, ndcgs_five = [], []

    for user_id in tqdm(df["user_id"].unique()):
        # Use the arc-4 recommender with the model passed in; the generic
        # get_recommendations(user_id, df) ignores `model` entirely.
        top_ten_articles = get_recommendations_arc4(user_id, df, model)
        top_ten_subcategories = [get_article_subcategory(_id, df) for _id in top_ten_articles]
        # Read the profile from the frame being evaluated rather than from the
        # notebook-level df_test, so the function honours its `df` argument.
        user_profile = get_user_profile(df, user_id)

        hits_ten.append(get_category_hit_ratio(user_profile, top_ten_subcategories))
        ndcgs_ten.append(get_ndcgs_category(user_profile, top_ten_subcategories))
        hits_five.append(get_category_hit_ratio(user_profile, top_ten_subcategories[:5]))
        ndcgs_five.append(get_ndcgs_category(user_profile, top_ten_subcategories[:5]))
    return np.average(hits_ten), np.average(ndcgs_ten), np.average(hits_five), np.average(ndcgs_five)


category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five = predict_all_users_arc4(df_test, model_arc4)

In [None]:
# Category-level hit@10, ndcg@10, hit@5 and ndcg@5 for architecture 4, one per line.
print(category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five, sep="\n")

In [None]:
# Hand the four category-level averages to write_category_results under the key "arc4".
write_category_results("arc4", category_hits_ten, category_ndcg_ten, category_hits_five, category_ndcg_five)