In [None]:
!pwd

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tqdm import tqdm
import re
import scipy
#from tensorflow import keras
from tensorflow.keras.layers import Input,Flatten, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization, multiply
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
#import keras.backend as K
from sklearn.utils import shuffle
import seaborn as sns
import math
import pickle
import collections
from collections import Counter
import random

In [2]:
# Directory (relative to the notebook) that holds the pickled input frames.
PATH = "../data/adressa_v2/"
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load files from a trusted source.
with open(PATH + "articles_v3.bin", "rb") as f_in:
    articles = pickle.load(f_in)
# NOTE(review): an earlier note here mentioned behaviors.bin and
# behaviors_two_days.bin as the available inputs, but this cell loads
# full_behaviors.bin - confirm which file is intended.
with open(PATH + "full_behaviors.bin", "rb") as f_in:
    behaviors = pickle.load(f_in)

In [3]:
print(len(behaviors["userId"].unique()))

640503


In [None]:
# Keep a random ~20% of the behaviour rows to make the rest of the notebook
# cheaper to run. The mask is drawn directly from len(behaviors); the previous
# version first allocated a len(behaviors) x 2 frame of random normals only to
# read its length back.
# NOTE: this cell is not idempotent - re-running it subsamples the already
# reduced frame again.
print(len(behaviors))
msk = np.random.rand(len(behaviors)) < 0.2
behaviors = behaviors[msk]
print(len(behaviors))

In [None]:
behaviors.head(1)

# 1. Preprocessing

In [None]:
# Interpret the numeric "time" column as seconds since the Unix epoch.
behaviors["time"] = pd.to_datetime(behaviors["time"], unit="s")
# Keep one row per (user, article) pair; repeated reads of the same article are dropped.
behaviors = behaviors.drop_duplicates(["userId", "id"])
print("before merge: ",len(behaviors))
# Drop the behaviour-side title before joining with the article table.
# NOTE(review): presumably `articles` carries its own "title" column - confirm.
behaviors = behaviors.drop(columns=["title"])
# Align the join key name with the behaviour frame (mutates `articles` in place).
articles.rename(columns={"article_id": "id"}, inplace=True)
# merge defaults to an inner join, so behaviours whose article is missing
# from `articles` are silently dropped here.
behaviors = behaviors.merge(articles, on=["id"])
print("after merge:",len(behaviors))

print("Len before removal: ",len(behaviors))
# Keep only users with more than two remaining rows (i.e. at least three).
behaviors = behaviors[behaviors.groupby('userId').userId.transform('count')>2].copy()
print("Len after removal: ",len(behaviors))


# Map raw user / article identifiers to consecutive integer ids.
user_enc = LabelEncoder()
article_enc = LabelEncoder()
behaviors["user_id"] = user_enc.fit_transform(behaviors["userId"].values)
behaviors["article_id"] = article_enc.fit_transform(behaviors["id"].values)





In [None]:
import nltk
from nltk.corpus import stopwords
# Helper functions
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return the lower-cased version of `text` (delegates to `text.lower()`)."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Remove Norwegian stop words from a whitespace-separated string.

    The text is split on whitespace, every token that exactly matches
    (case-sensitively) an entry of NLTK's "norwegian" stop-word list is
    dropped, and the remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The filtered text; runs of whitespace collapse to one space.
    """
    # The stop-word list is the same for every call. Load it once and cache it
    # on the function object instead of re-reading and re-hashing it for every
    # row when this is used with DataFrame.apply.
    stops = getattr(remove_stop_words, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("norwegian"))
        remove_stop_words._stops = stops
    return " ".join(word for word in text.split() if word not in stops)

def remove_html(text):
    """Delete every `<...>` span from `text`, matching non-greedily within a line."""
    return re.sub('<.*?>', '', text)

def remove_punctuation(text):
    """Strip every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def text_to_list(text):
    """Split `text` on single spaces; consecutive spaces yield empty strings."""
    return text.split(" ")

def take_one_category(text):
    """Reduce a whitespace-separated category string to a single token.

    Returns the second token when there are at least two, otherwise the only
    token. An empty or whitespace-only string yields "" (previously this
    raised IndexError).

    NOTE(review): presumably the first token is a top-level category and the
    second a more specific one - confirm against the data.

    Parameters
    ----------
    text : str
        Category string with tokens separated by whitespace.

    Returns
    -------
    str
        The selected token, or "" when `text` has no tokens.
    """
    tokens = text.split()
    if not tokens:
        return ""
    return tokens[1] if len(tokens) > 1 else tokens[0]

In [None]:
def clean_title(df):
    """Add a "title_cleaned" column to `df` (in place) and return `df`.

    The title is lower-cased, then stripped of stop words, then stripped of
    punctuation, in that order.
    """
    df["title_cleaned"] = (
        df.title
        .apply(func = make_lower_case)
        .apply(func = remove_stop_words)
        .apply(func = remove_punctuation)
    )
    return df
def hyphen_to_underline(category):
    """
    Replace every hyphen in `category` with an underscore, so that a hyphenated
    subcategory stays a single token for the Tfidf vectoriser.
    """
    pieces = category.split("-")
    return "_".join(pieces)
# Title cleaning is currently disabled; the call is kept for reference.
#behaviors = clean_title(behaviors)
# Reduce each kw_category string to a single token (see take_one_category).
behaviors["category_cleaned"] = behaviors["kw_category"].apply(func = take_one_category)

In [None]:
# NOTE(review): category_enc is created but never fitted in this cell; the
# cleaned category is encoded with subcategory_enc instead - confirm intent.
category_enc = LabelEncoder()
subcategory_enc = LabelEncoder()
# Integer id for each distinct category_cleaned value.
behaviors["category_int"] = subcategory_enc.fit_transform(behaviors["category_cleaned"].values)


In [None]:
# Build a fixed-length profile per user: the user's distinct category ids
# ordered by (frequency, id) descending, truncated / zero-padded to 6 entries.
#
# The rows are grouped once with groupby instead of filtering the whole frame
# with a boolean mask for every user, which scaled as users x rows.
# sort=False keeps the groups in order of first appearance, i.e. the same
# order as behaviors["user_id"].unique().
#
# NOTE(review): the padding value 0 is also a valid encoded category id, so a
# padded slot is indistinguishable from category 0 - confirm this is intended.
users = behaviors["user_id"].unique()
userid_to_profile = collections.defaultdict(list)
for user_id, user_subcat in tqdm(behaviors.groupby("user_id", sort=False)["category_int"]):
    counter = Counter(user_subcat.tolist())
    # Sorting the distinct ids directly gives the same order as sorting every
    # occurrence and then de-duplicating, because the sort key depends only on
    # the id itself.
    final_subcategories = sorted(counter, key=lambda x: (counter[x], x), reverse=True)
    # A non-positive multiplier yields an empty list, so long profiles are
    # left untouched here and cut by the slice below.
    final_subcategories += [0] * (6 - len(final_subcategories))
    userid_to_profile[user_id] = final_subcategories[:6]

In [None]:
# One row per user, one column per profile slot.
profile_df = pd.DataFrame.from_dict(userid_to_profile, orient="index")
# from_dict(orient="index") labels the slot columns with the integers 0..5.
# The previous rename used the string keys "0".."5", matched nothing and left
# the integer labels in place; name the slots p0..p5 explicitly before merging.
profile_df.columns = [f"p{position}" for position in range(profile_df.shape[1])]
profile_df["user_id"] = profile_df.index
behaviors = behaviors.merge(profile_df, on="user_id")

# Lookup from encoded article id to encoded category id. For an article that
# appears in several rows the last row wins.
article_id_to_category_int = behaviors.set_index("article_id")["category_int"].to_dict()

behaviors.head(1)

# 2. Train test split

In [None]:
# Seed shared by every stochastic step below.
SEED = 42
# Number of non-interacted articles ranked against each held-out test article.
SAMPLE_SIZE = 99
# Number of negative instances generated per positive training instance.
NUM_NEGATIVES = 4
# Universe of encoded article ids that negatives are drawn from.
ALL_ARTICLE_IDS = behaviors["article_id"].unique()

# SEED was declared but never applied, so the train/test split, the negative
# sampling and the weight initialisation differed on every run. Seed the
# generators used by the following cells.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
behaviors["article_id"].unique()

In [None]:
# Every observed (user, article) pair is a positive instance.
# Assigning the scalar 1 labels every row regardless of the frame's index.
# The previous pd.Series built from a list was aligned on index labels and
# would have produced NaN labels if the index were not exactly 0..n-1.
interactions = behaviors[["user_id", "article_id"]].assign(label=1)

In [None]:
# Row-level random split: each interaction goes to train with probability 0.8,
# otherwise to test. The split is not stratified per user.
msk = np.random.rand(len(interactions)) <0.8
train = interactions[msk]
test = interactions[~msk]

In [None]:
# Move user_id from a column into the index of all three frames.
# NOTE: not idempotent - after this cell "user_id" is no longer a column, so
# running it a second time raises KeyError.
interactions = interactions.set_index("user_id")
train = train.set_index("user_id")
test = test.set_index("user_id")

In [None]:
def negative_sampling(train_df, user_id, article_id, max_samples=40000):
    """
    Build a shuffled list of positive and randomly sampled negative instances.

    For every distinct (user, article) pair in `train_df`, one instance with
    label 1 is emitted together with NUM_NEGATIVES instances with label 0.
    Each negative article is drawn uniformly from ALL_ARTICLE_IDS and redrawn
    until the (user, article) pair is not a positive pair of `train_df`.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Positive interactions.
    user_id : str
        Name of the user column. When `train_df` has no such column, the
        frame's index is used as the user ids (the previous behaviour, which
        ignored this argument).
    article_id : str
        Name of the article column.
    max_samples : int or None, default 40000
        Number of instances kept after shuffling; None keeps all of them.
        The default reproduces the previously hard-coded truncation.

    Returns
    -------
    tuple of (users, articles, labels)
        Three equally long sequences, jointly shuffled with a fixed
        random_state of 0.

    Notes
    -----
    The redraw loop does not terminate for a user whose positives cover all of
    ALL_ARTICLE_IDS.
    """
    if user_id in train_df.columns:
        user_values = train_df[user_id].values
    else:
        user_values = train_df.index.values

    users, articles, labels = [], [], []
    user_item_set = set(zip(user_values, train_df[article_id].values))
    for (u, i) in user_item_set:
        for _ in range(NUM_NEGATIVES):
            negative_item = np.random.choice(ALL_ARTICLE_IDS)
            while (u, negative_item) in user_item_set:
                negative_item = np.random.choice(ALL_ARTICLE_IDS)
            users.append(u)
            articles.append(negative_item)
            labels.append(0)
        users.append(u)
        articles.append(i)
        labels.append(1)

    users, articles, labels = shuffle(users, articles, labels, random_state=0)
    if max_samples is None:
        return users, articles, labels
    return users[:max_samples], articles[:max_samples], labels[:max_samples]

# `train` is indexed by user_id at this point, so the user ids come from its index.
train_users, train_articles, train_labels = negative_sampling(train, "user_id", "article_id")

In [None]:
# Tabular view of the sampled training instances, one row per instance.
train_df = pd.DataFrame(
    {
        "user_id": train_users,
        "article_ids": train_articles,
        "label": train_labels,
    },
    columns=["user_id", "article_ids", "label"],
)

In [None]:

def get_items_interacted(user_id, interactions_df=behaviors):
    """Return the set of article ids that `user_id` has interacted with.

    Works for both layouts used in this notebook:

    * a frame that has a "user_id" column (such as the default `behaviors`):
      rows are selected by comparing that column with `user_id`. Previously
      `.loc[user_id]` was used here, which looked up the row whose index
      label happened to equal `user_id` rather than the user's rows.
    * a frame indexed by user id: rows are selected through the index.

    Parameters
    ----------
    user_id : int
        Encoded user id.
    interactions_df : pandas.DataFrame
        Interactions with an "article_id" column.

    Returns
    -------
    set
        Article ids of the user. Empty when a column lookup matches no row.

    Raises
    ------
    KeyError
        When the frame is indexed by user id and `user_id` is not in the index.
    """
    if "user_id" in interactions_df.columns:
        by_user = interactions_df["user_id"] == user_id
        interacted_items = interactions_df.loc[by_user, "article_id"]
    else:
        # A list selector always yields a Series, even for a single match.
        interacted_items = interactions_df.loc[[user_id], "article_id"]
    return set(interacted_items)

def get_not_interacted(user_id, interactions_df=behaviors):
    """Sample SAMPLE_SIZE articles that `user_id` has not interacted with.

    The candidate universe is every article id in the global `behaviors`
    frame minus the user's interacted articles in `interactions_df`.

    Parameters
    ----------
    user_id : int
        Encoded user id.
    interactions_df : pandas.DataFrame
        Frame passed on to get_items_interacted.

    Returns
    -------
    list
        SAMPLE_SIZE distinct article ids; the same list for repeated calls
        with the same inputs.

    Raises
    ------
    ValueError
        When fewer than SAMPLE_SIZE candidates remain.
    """
    interacted_items = get_items_interacted(user_id, interactions_df)
    # unique() first, so the set is built from distinct ids rather than from
    # every interaction row on every call.
    all_items = set(behaviors["article_id"].unique().tolist())
    # random.sample requires a sequence (passing a set raises TypeError on
    # Python 3.11+); sorting also makes the draw independent of set ordering.
    candidates = sorted(all_items - interacted_items)
    # A private generator keeps the sample reproducible without resetting the
    # global `random` state on every call.
    rng = random.Random(SEED)
    return rng.sample(candidates, SAMPLE_SIZE)

In [None]:
# Vocabulary sizes for the embedding tables and the width of the MF embeddings.
num_users = behaviors["user_id"].unique().size
num_items = behaviors["article_id"].unique().size
dims = 20
def get_model_neumf(num_users, num_items, dims, dense_layers=(128, 64, 32, 8)):
    """Build and compile a two-branch user/item scoring model.

    The model takes a user id and an item id and combines

    * a matrix-factorisation branch: element-wise product of a `dims`-wide
      user embedding and a `dims`-wide item embedding, and
    * an MLP branch: concatenated user/item embeddings (each
      ``dense_layers[0] / 2`` wide) passed through one ReLU Dense layer per
      entry of `dense_layers`.

    Both branches are concatenated and fed to a single sigmoid unit.

    Parameters
    ----------
    num_users : int
        Size of the user embedding tables.
    num_items : int
        Size of the item embedding tables.
    dims : int
        Width of the matrix-factorisation embeddings.
    dense_layers : sequence of int, default (128, 64, 32, 8)
        Units of each Dense layer in the MLP branch; the first entry also
        fixes the MLP embedding width. The default is a tuple so that it
        cannot be mutated between calls.

    Returns
    -------
    tensorflow.keras.Model
        Model with inputs ``[user, item]``, compiled with Adam(0.01),
        binary cross-entropy loss and accuracy as metric.
    """
    user_input = Input(shape=(1,), name="user")
    item_input = Input(shape=(1,), name="item")

    mf_user_emb = Embedding(output_dim=dims,
                            input_dim=num_users,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_user_emb")(user_input)
    mf_item_emb = Embedding(output_dim=dims,
                            input_dim=num_items,
                            input_length=1,
                            embeddings_initializer='he_normal',
                            embeddings_regularizer=regularizers.l2(0.001),
                            name="mf_item_emb")(item_input)

    # The MLP branch has its own embedding tables, each half as wide as the
    # first Dense layer.
    mlp_emb_dim = int(dense_layers[0] / 2)
    mlp_user_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_users,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_emb")(user_input)
    # The layer name "mlp_user_item" is kept as is: renaming it would change
    # the model's layer names for anything that looks layers up by name.
    mlp_item_emb = Embedding(output_dim=mlp_emb_dim,
                             input_dim=num_items,
                             input_length=1,
                             embeddings_initializer='he_normal',
                             embeddings_regularizer=regularizers.l2(0.001),
                             name="mlp_user_item")(item_input)

    # Matrix factorization: drop the length-1 axis, then multiply element-wise.
    mf_user_vecs = Reshape([dims])(mf_user_emb)
    mf_item_vecs = Reshape([dims])(mf_item_emb)
    mf_vec = multiply([mf_user_vecs, mf_item_vecs])

    # MLP: concatenate both embeddings, flatten, then stack the Dense layers.
    mlp_vec = Concatenate()([mlp_user_emb, mlp_item_emb])
    mlp_vector = Flatten()(mlp_vec)
    for num_nodes in dense_layers:
        mlp_vector = Dense(num_nodes, activation="relu")(mlp_vector)

    # Fuse both branches into one interaction probability.
    y = Concatenate()([mf_vec, mlp_vector])
    y = Dense(1, activation="sigmoid", name="pred")(y)

    model = Model(inputs=[user_input, item_input], outputs=y)
    model.compile(
        optimizer=Adam(0.01),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

# Instantiate the model with the default dense layer sizes.
model_neumf = get_model_neumf(num_users, num_items, dims)

In [None]:
# Column vectors (one row per training instance) for the two model inputs and the target.
users_input = np.array(train_users).reshape(-1, 1)
articles_input = np.array(train_articles).reshape(-1, 1)
labels_input = np.array(train_labels).reshape(-1, 1)

In [None]:
# Distinct user ids present in the training split (train is indexed by user id).
all_user_ids = train.index.unique().values

#user_input = df_train.iloc[:, 0].values.reshape((-1,1))
#profile_input = df_train.iloc[:, 1:6].values
#item_input = df_train.iloc[:, 7].values.reshape((-1,1))
#labels = df_train.iloc[:, 8].values.reshape((-1,1))


# Per-epoch training history. val_loss / val_acc stay empty because no
# validation data is passed to fit below.
train_loss = []
val_loss = []
train_acc = []
val_acc = []

# Bookkeeping for the per-epoch ranking evaluation, which is currently
# disabled (see the commented-out block inside the loop).
hits_list = []
ndcg_list = []
best_hits = 0
best_ndcgs = 0
best_hits_five = 0
best_ndcgs_five = 0

epochs=4
for epoch in range(epochs):
    # One Keras epoch per loop iteration, so per-epoch work can be inserted
    # between epochs.
    hist = model_neumf.fit([users_input, articles_input], labels_input, epochs=1, shuffle=True, verbose=1, batch_size=32)
    
    # hist.history[...] is a list with one value per fitted epoch, so each
    # append adds a one-element list (train_loss ends up as a list of lists).
    train_loss.append(hist.history["loss"])
    train_acc.append(hist.history["accuracy"])
    #val_loss.append(hist.history["val_loss"])
    #val_acc.append(hist.history["val_accuracy"])
    
    #hits, ndcgs, hits_five, ndcgs_five = evalaute_model_neumf( model_neumf, df_test, userid_to_true_item)
    #hits_list.append(np.average(hits))
    #ndcg_list.append(np.average(ndcgs))
    
    #temp_hits = np.average(hits)
    #temp_ndcgs = np.average(ndcgs)
    #if (temp_hits > best_hits):
    #    best_hits = temp_hits
    #    best_ndcgs = temp_ndcgs
    #    best_hits_five = np.average(hits_five)
    #    best_ndcgs_five = np.average(ndcgs_five)

In [None]:
# Hit rate @ 10 on the first 1000 test interactions: each held-out article is
# ranked against the sampled non-interacted articles of the same user and
# counts as a hit when the model places it among the ten highest scores.
test_users = test.index.values[:1000]
test_items = test.article_id.values[:1000]
test_set = zip(test_users, test_items)
hits = []
# zip has no length, so pass the total explicitly to get a real progress bar.
for (u, i) in tqdm(test_set, total=len(test_users)):
    not_interacted_items = get_not_interacted(u)
    items = np.array([i] + not_interacted_items)
    # Shuffle so the held-out article does not always sit at position 0.
    np.random.shuffle(items)
    # One user id per candidate; derived from len(items) instead of a
    # hard-coded 100 so it stays in sync with SAMPLE_SIZE.
    users = np.full(len(items), u).astype(int)
    # verbose=0 silences the per-call progress output; the per-iteration debug
    # prints of `i` and `items` were removed for the same reason.
    predictions = model_neumf.predict([users, items], verbose=0)
    predicted_labels = np.squeeze(predictions)
    top_ten_items = [items[k] for k in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    hits.append(1 if i in top_ten_items else 0)
print(np.average(hits))

In [None]:
# Manual walk-through of the evaluation for the first test interaction.
u, i = test.index.values[0], test.article_id.values[0]

not_interacted_items = get_not_interacted(u)
# 100 copies of the user id, one per candidate article.
users = np.array([u for _ in range(100)])
# The held-out article first, followed by the sampled non-interacted ones.
items = np.array([i, *not_interacted_items])


In [None]:
np.random.shuffle(items)

In [None]:
items

In [None]:
# Score every candidate and keep the ten articles with the highest scores.
predictions = model_neumf.predict([users, items])
predicted_labels = np.squeeze(predictions)
top_ten_items = list(items[np.argsort(predicted_labels)[::-1][0:10]])

In [None]:
users_input.shape

In [None]:
# Full (untruncated) test users and articles; this overwrites the
# 1000-element slices created in the evaluation cell.
test_users = test.index.values
test_items = test.article_id.values

In [None]:
len(test_users)