In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import re
from collections import defaultdict
import spacy
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eivindfalun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [2]:
print("Pandas version", pd.__version__)

Pandas version 1.2.3


# Import data

In [3]:
PATH = "addressa_data_preprocessed/"
with open(PATH + "articles.bin", "rb") as f_in:
    articles = pickle.load(f_in)
with open(PATH + "behaviors.bin", "rb") as f_in:
    behaviors = pickle.load(f_in)

In [4]:
print("Df articles length: ", len(articles))
print("Number unique articles: ", len(articles["article_id"].unique()))
print("Df behaviors length: ", len(behaviors))
print("Number of unique users", len(behaviors["user"].unique()))

Df articles length:  74886
Number unique articles:  74886
Df behaviors length:  159937
Number of unique users 35913


# 1. Preprocessing

## 1.1 Preprocess behaviors

In [5]:
# Helper functions
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("norwegian"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text

def text_to_list(text):
    text = text.split(" ")
    return text

In [6]:
def clean_title(df):
    df["title_cleaned"] = df.title.apply(func = make_lower_case)
    df["title_cleaned"] = df.title_cleaned.apply(func = remove_stop_words)
    df["title_cleaned"] = df.title_cleaned.apply(func = remove_punctuation)
    return df
behaviors = clean_title(behaviors)

In [7]:
behaviors["time"] = pd.to_datetime(behaviors["time"], unit="s")
behaviors["author"].fillna("null", inplace=True)

In [8]:
behaviors.drop(columns=["userFreq", "articleId"], inplace=True)
behaviors.head()

Unnamed: 0,user,userId,title,author,id,time,title_cleaned
0,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 17:07:21,ferieåret 2017
1,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Bolig totalskadd i brann,frank lervik,338d849c5c3e0a320d91a2ed2026e43e7c17f8dc,2017-01-01 08:49:47,bolig totalskadd brann
2,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Polart lavtrykk med krafig vind og tett snødre...,torsten hanssen,2f467692114ca904797b884155dc0d423b5d9c42,2017-01-01 17:06:26,polart lavtrykk krafig vind tett snødrev treff...
3,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Nødbluss sendt gjennom vindu startet branntilløp,joakim slettebak wangen,a60c0b9a0ba539404271d0d51ffd209760a42cff,2017-01-01 08:48:51,nødbluss sendt gjennom vindu startet branntilløp
4,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Hvem syns du er Årets trønder?,espen rasmussen,ac6aacb71fb09db2bb79554e6bc5ecdb95103ea2,2017-01-01 18:26:23,syns årets trønder


In [9]:
print(len(behaviors))
sub_behaviors = behaviors[behaviors.groupby('user').user.transform('count')>3].copy()
print(len(sub_behaviors))
sub_behaviors.head()

159937
125854


Unnamed: 0,user,userId,title,author,id,time,title_cleaned
0,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 17:07:21,ferieåret 2017
1,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Bolig totalskadd i brann,frank lervik,338d849c5c3e0a320d91a2ed2026e43e7c17f8dc,2017-01-01 08:49:47,bolig totalskadd brann
2,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Polart lavtrykk med krafig vind og tett snødre...,torsten hanssen,2f467692114ca904797b884155dc0d423b5d9c42,2017-01-01 17:06:26,polart lavtrykk krafig vind tett snødrev treff...
3,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Nødbluss sendt gjennom vindu startet branntilløp,joakim slettebak wangen,a60c0b9a0ba539404271d0d51ffd209760a42cff,2017-01-01 08:48:51,nødbluss sendt gjennom vindu startet branntilløp
4,13,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Hvem syns du er Årets trønder?,espen rasmussen,ac6aacb71fb09db2bb79554e6bc5ecdb95103ea2,2017-01-01 18:26:23,syns årets trønder


## 1.2 Preprocess articles

In [10]:
articles.head()

Unnamed: 0,article_id,author,body,title,url,kw_category,article,title_cleaned,category_preprocessed
0,fcc01a7a1a7f7092a2da6b9c5186fdef421c8ab6,pål solberg,Det er Trøndelag politidistrikt som klokken 1...,- Dette er ingen lekeplass,http://www.adressa.no/nyheter/sortrondelag/201...,nyheter sortrondelag,73905,lekeplass,"[nyheter, sortrondelag]"
1,e1c14c3f599c9764a003740b9959c4e6f2fbc8e3,empty,Det er Trøndelag Veteranvognklubb TVK som for...,Trondheim fylles med veteranbiler,http://www.adressa.no/bil/veteran/article80867...,bil veteran,65918,trondheim fylles veteranbiler,"[bil, veteran]"
2,6a0612e60690288a776834811004ce133f326cee,annemona grann,Historiene er nesten for utrolige og rommer e...,- Historiene er nesten for utrolige,http://www.adressa.no/kultur/2015/11/06/Histor...,kultur,30909,historiene nesten utrolige,[kultur]
3,13eb96b4cfbbc5954c54a75737afcac5ccc61779,elin fosshaug olsø,Flere bilførere reagerte med aggressiv kjørin...,Bilister aggressive mot trafikkaksjon,http://www.adressa.no/nyheter/trondheim/articl...,nyheter trondheim,5855,bilister aggressive trafikkaksjon,"[nyheter, trondheim]"
4,b40a30877124510cf65683b6c9391d927e20f89d,ann iren bævre,Under årets store interiørmesse i Milano var ...,Fyll på med småbord,http://www.adressa.no/forbruker/hjem/article15...,forbruker hjem,52530,fyll småbord,"[forbruker, hjem]"


### 1.2.1 One-hot-encode authors

In [11]:
def gen_authors_unique(df, column):
    authors = []
    
    def extract_authors(_list):
        for elem in _list:
            if isinstance(elem, list):
                extract_authors(elem)
            else:
                if elem not in authors:
                    authors.append(elem)
    extract_authors(df[column].values)
    return authors
    
authors = gen_authors_unique(articles, "author")
authors_to_id = {name: idx for idx, name in enumerate(authors)}

In [12]:
authors_list = []
for idx, val in enumerate(articles["author"].values):
    try:
        authors_list.append(authors_to_id[val])
    except:
        # Val is a list; the article is written by multiple article. I append the first author in the list
        authors_list.append(authors_to_id[val[0]])
articles["authors_onehot"] = authors_list

In [13]:
num_authors = len(authors_to_id)
print("There are: ", num_authors, " unique authors")

There are:  6319  unique authors


### 1.2.2 Tokenize titles

In [14]:
titles = articles["title_cleaned"]
title_tokenizer = Tokenizer()
title_tokenizer.fit_on_texts(titles)
titles_to_num = title_tokenizer.texts_to_sequences(titles)
maxlen=300
vocab_size = len(title_tokenizer.word_index)  + 1
titles_to_num[0]

[4647]

In [15]:
reverse_word_map = dict(map(reversed, title_tokenizer.word_index.items()))
reverse_word_map[0] = ""

In [16]:
max_len = 6
padding = 0
#titles_arr = np.zeros((len(titles_to_num), max_len))

for i in range(len(titles_to_num)):
    if len(titles_to_num[i]) < max_len:
        while len(titles_to_num[i]) < max_len:
            titles_to_num[i].append(0)
    else:
        titles_to_num[i] = titles_to_num[i][:max_len]
titles_to_num




[[4647, 0, 0, 0, 0, 0],
 [1, 6236, 15565, 0, 0, 0],
 [6237, 94, 4648, 0, 0, 0],
 [1351, 11799, 23537, 0, 0, 0],
 [3467, 23538, 0, 0, 0, 0],
 [1626, 5610, 3718, 0, 0, 0],
 [2896, 436, 23539, 295, 0, 0],
 [53, 23540, 2897, 2741, 0, 0],
 [5090, 5611, 0, 0, 0, 0],
 [215, 15566, 1431, 0, 0, 0],
 [3051, 2065, 42, 901, 948, 0],
 [365, 7053, 536, 0, 0, 0],
 [3468, 23541, 0, 0, 0, 0],
 [76, 1042, 23542, 0, 0, 0],
 [409, 244, 46, 8140, 1884, 0],
 [4, 377, 1688, 1, 0, 0],
 [536, 9582, 4, 11800, 0, 0],
 [528, 11801, 0, 0, 0, 0],
 [730, 23543, 997, 1043, 11802, 0],
 [325, 106, 3469, 15567, 4649, 998],
 [1885, 21, 165, 1383, 3470, 0],
 [23544, 567, 0, 0, 0, 0],
 [1886, 23545, 15568, 0, 0, 0],
 [780, 391, 23546, 352, 0, 0],
 [1197, 133, 2469, 0, 0, 0],
 [59, 8141, 9583, 23547, 1229, 23548],
 [681, 321, 5612, 10, 180, 227],
 [11803, 126, 3242, 6238, 1523, 0],
 [228, 7054, 11804, 3243, 23549, 0],
 [100, 7, 11805, 23550, 1, 1689],
 [1806, 23551, 0, 0, 0, 0],
 [4295, 15569, 162, 2742, 0, 0],
 [6, 11806, 

In [17]:
assert len(max(titles_to_num, key=len)) == max_len


In [18]:
articles["title_tokenized"] = titles_to_num

In [19]:
articles.head()

Unnamed: 0,article_id,author,body,title,url,kw_category,article,title_cleaned,category_preprocessed,authors_onehot,title_tokenized
0,fcc01a7a1a7f7092a2da6b9c5186fdef421c8ab6,pål solberg,Det er Trøndelag politidistrikt som klokken 1...,- Dette er ingen lekeplass,http://www.adressa.no/nyheter/sortrondelag/201...,nyheter sortrondelag,73905,lekeplass,"[nyheter, sortrondelag]",0,"[4647, 0, 0, 0, 0, 0]"
1,e1c14c3f599c9764a003740b9959c4e6f2fbc8e3,empty,Det er Trøndelag Veteranvognklubb TVK som for...,Trondheim fylles med veteranbiler,http://www.adressa.no/bil/veteran/article80867...,bil veteran,65918,trondheim fylles veteranbiler,"[bil, veteran]",1,"[1, 6236, 15565, 0, 0, 0]"
2,6a0612e60690288a776834811004ce133f326cee,annemona grann,Historiene er nesten for utrolige og rommer e...,- Historiene er nesten for utrolige,http://www.adressa.no/kultur/2015/11/06/Histor...,kultur,30909,historiene nesten utrolige,[kultur],2,"[6237, 94, 4648, 0, 0, 0]"
3,13eb96b4cfbbc5954c54a75737afcac5ccc61779,elin fosshaug olsø,Flere bilførere reagerte med aggressiv kjørin...,Bilister aggressive mot trafikkaksjon,http://www.adressa.no/nyheter/trondheim/articl...,nyheter trondheim,5855,bilister aggressive trafikkaksjon,"[nyheter, trondheim]",3,"[1351, 11799, 23537, 0, 0, 0]"
4,b40a30877124510cf65683b6c9391d927e20f89d,ann iren bævre,Under årets store interiørmesse i Milano var ...,Fyll på med småbord,http://www.adressa.no/forbruker/hjem/article15...,forbruker hjem,52530,fyll småbord,"[forbruker, hjem]",4,"[3467, 23538, 0, 0, 0, 0]"


### 1.2.3 Tokenize categories

In [20]:
# Fix categories from [nyheter, sport] to:[ [1] [3] ]

# extract categories
def get_categories(df, column):
    categories = []
    
    def extract_category(_list):
        for elem in _list:
            if isinstance(elem, list):
                extract_category(elem)
            else:
                if elem not in categories:
                    categories.append(elem)
    extract_category(df[column].values)
    return categories

unique_categories = get_categories(articles, "category_preprocessed") 
category_to_id = {category: idx + 1 for idx, category in enumerate(unique_categories)}
id_to_category = {idx: category for idx, category in category_to_id.items()}
category_to_id["0"] = 0
id_to_category[0] = "0"

def onehotencode_category(df, column, max_len, padding):
    i = 0
    for i in tqdm(range(max_len)):
        df[column + "_" + str(i)] = df[column].apply(lambda x: x[i] if i < len(x) else padding)
    #df.drop(column, axis=1, inplace=True)
    return df

articles = onehotencode_category(articles, "category_preprocessed", 2, 0)

def encode_categories(df):
    for i in tqdm(range(len(df))):
        df["category_preprocessed_0"].iloc[i] = category_to_id[df["category_preprocessed_0"].iloc[i]]
        df["category_preprocessed_1"].iloc[i] = category_to_id[str(df["category_preprocessed_1"].iloc[i])]
    return df
articles = encode_categories(articles)

100%|██████████| 2/2 [00:00<00:00, 41.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|██████████| 74886/74886 [02:01<00:00, 614.73it/s]


In [21]:
articles.head(1)

Unnamed: 0,article_id,author,body,title,url,kw_category,article,title_cleaned,category_preprocessed,authors_onehot,title_tokenized,category_preprocessed_0,category_preprocessed_1
0,fcc01a7a1a7f7092a2da6b9c5186fdef421c8ab6,pål solberg,Det er Trøndelag politidistrikt som klokken 1...,- Dette er ingen lekeplass,http://www.adressa.no/nyheter/sortrondelag/201...,nyheter sortrondelag,73905,lekeplass,"[nyheter, sortrondelag]",0,"[4647, 0, 0, 0, 0, 0]",1,2


### 1.2.4 Preprocess bodies

In [22]:
def clean_bodies(df):
    df["body_cleaned"] = df.body.apply(func = make_lower_case)
    df["body_cleaned"] = df.body_cleaned.apply(func = remove_stop_words)
    df["body_cleaned"] = df.body_cleaned.apply(func = remove_punctuation)
    return df
articles = clean_bodies(articles)

In [23]:
articles.head(1)

Unnamed: 0,article_id,author,body,title,url,kw_category,article,title_cleaned,category_preprocessed,authors_onehot,title_tokenized,category_preprocessed_0,category_preprocessed_1,body_cleaned
0,fcc01a7a1a7f7092a2da6b9c5186fdef421c8ab6,pål solberg,Det er Trøndelag politidistrikt som klokken 1...,- Dette er ingen lekeplass,http://www.adressa.no/nyheter/sortrondelag/201...,nyheter sortrondelag,73905,lekeplass,"[nyheter, sortrondelag]",0,"[4647, 0, 0, 0, 0, 0]",1,2,trøndelag politidistrikt klokken 15 50 melder ...


## 1.1 Preprocessing specific to neural mf

In [24]:
# Rename id to article_id and merge behaviors with articles to get more article features
mf = sub_behaviors.copy() # true state: sub_behaviors -> behaviors 
mf.rename(columns={"id": "article_id"}, inplace=True)
mf = mf.merge(articles[["article_id", "category_preprocessed", "authors_onehot", "title_tokenized", "category_preprocessed_0", "category_preprocessed_1", "body_cleaned"]], on="article_id")

### 1.1.2 One-hot-encode categories

### 1.1.3  Tokenize titles

### 1.1.4 Author to num

### 1.1.3 Set all instances to 1 (e.g. true)

In [25]:
# Set all clicks equal to 1 (e.g. true labels)
rating = [1 for i in range(len(mf))]
mf["rating"] = rating

### 1.1.4 Clean up - fix ids

In [26]:
article_enc = LabelEncoder()
user_enc = LabelEncoder()
mf["user"] = user_enc.fit_transform(mf["user"].values)
mf["news_id"] = article_enc.fit_transform(mf["article_id"].values)

## 1.2 Train test split

In [27]:
mf["rank_latest"] = mf.groupby(["user"])["time"].rank(method="first", ascending=False)
mf_train = mf[mf['rank_latest'] != 1]
mf_test = mf[mf['rank_latest'] == 1]
mf_train.head(1)

Unnamed: 0,user,userId,title,author,article_id,time,title_cleaned,category_preprocessed,authors_onehot,title_tokenized,category_preprocessed_0,category_preprocessed_1,body_cleaned,rating,news_id,rank_latest
0,0,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 17:07:21,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,4.0


In [28]:
assert mf["news_id"].max() + 1 == len(mf["news_id"].unique())

In [29]:
assert mf["user"].max() + 1 == len(mf["user"].unique())
print(mf["user"].max(), len(mf["user"].unique()))

21668 21669


## 1.3 Negative sampling

### 1.3.1 Negative samling for two-tower model with category-feature

In [30]:
news_id_to_cat0 = dict(zip(mf.news_id, mf.category_preprocessed_0))
news_id_to_cat1 = dict(zip(mf.news_id, mf.category_preprocessed_1))

In [31]:
print(len(mf_train["user"].unique()))
print(len(mf["user"].unique()))
print(len(mf_test["user"].unique()))

21669
21669
21669


In [None]:
def negative_sample_two_tower_model(df, all_article_ids, all_user_ids):
    """
    Return categories...
    """
    
    users, articles, category_1, category_2, labels = [], [], [], [],[]
    user_item_set = set(zip(df["user"], df["news_id"], df["category_preprocessed_0"], df["category_preprocessed_1"]))
    num_negatives = 4

    for (u, i, c_0, c_1) in tqdm(user_item_set):
        users.append(u)
        articles.append(i)
        category_1.append(c_0)
        category_2.append(c_1)
        labels.append(1)
        for _ in range(num_negatives):
            negative_item = np.random.choice(all_article_ids)
            while (u, negative_item, c_0, c_1) in user_item_set:
                negative_item = np.random.choice(all_article_ids)
            users.append(u)
            articles.append(negative_item)
            
            category_1.append(news_id_to_cat0[negative_item])
            category_2.append(news_id_to_cat1[negative_item])
            labels.append(0)
    return np.asarray(users), np.asarray(articles), np.asarray(category_1), np.asarray(category_2), np.asarray(labels)

all_article_ids_train = mf_train["news_id"].unique()
all_user_ids_train = mf_train["user"].unique()

#users, news_articles, category_1, category_2,labels = negative_sample_two_tower_feature_model(mf_train, all_article_ids_train, all_user_ids_train)

In [None]:

#assert len(users) == len(news_articles) == len(category_1) == len(category_2) == len(labels)

### 1.3.2 Negative sampling with multiple features

In [None]:
mf_train.head(1)

In [32]:
news_id_to_cat0 = dict(zip(mf.news_id, mf.category_preprocessed_0))
news_id_to_cat1 = dict(zip(mf.news_id, mf.category_preprocessed_1))
newsid_to_author = dict(zip(mf.news_id, mf.authors_onehot))
newsid_to_title = dict(zip(mf.news_id, mf.title_tokenized))


In [33]:
def negative_sample_two_tower_feature_model(df, all_article_ids, all_user_ids):
    """
    Return user_ids, news_ids, category_1, category_2, authors_onehotencoded, titles
    """
    
    users, articles, category_1, category_2, authors, titles, labels = [], [], [], [],[], [], []
    user_item_set = set(zip(df["user"], 
                            df["news_id"], 
                            df["category_preprocessed_0"], 
                            df["category_preprocessed_1"], 
                            df["authors_onehot"]))
    num_negatives = 4

    for (u, i, c_0, c_1, author) in tqdm(user_item_set):
        users.append(u)
        articles.append(i)
        category_1.append(c_0)
        category_2.append(c_1)
        authors.append(author)
        titles.append(newsid_to_title[i])
        labels.append(1)
        for _ in range(num_negatives):
            negative_item = np.random.choice(all_article_ids)
            while (u, negative_item, c_0, c_1) in user_item_set:
                negative_item = np.random.choice(all_article_ids)
            users.append(u)
            articles.append(negative_item)
            
            category_1.append(news_id_to_cat0[negative_item])
            category_2.append(news_id_to_cat1[negative_item])
            authors.append(newsid_to_author[negative_item])
            titles.append(newsid_to_title[negative_item])
            labels.append(0)
    return np.asarray(users), np.asarray(articles), np.asarray(category_1), np.asarray(category_2), np.asarray(authors),titles, np.asarray(labels)

all_article_ids_train = mf_train["news_id"].unique()
all_user_ids_train = mf_train["user"].unique()

users, news_articles, category_1, category_2, train_authors, train_titles, labels = negative_sample_two_tower_feature_model(mf_train, all_article_ids_train, all_user_ids_train)

100%|██████████| 90960/90960 [00:02<00:00, 38962.65it/s]


In [34]:
#for i in range()
#train_titles[0] = np.array(train_titles[0])
#y=numpy.array([numpy.array(xi) for xi in x])
test_ = []
for i in range(len(train_titles)):
    
    test_.append(train_titles[i][:6])
        
    
        
train_titles = np.array([np.array(xi) for xi in test_])
train_titles

array([[ 3098,   300,  2798,     0,     0,     0],
       [  219,  1899, 12832,  1870,     0,     0],
       [  800,    53,   111,  4264,  7796,     0],
       ...,
       [13620,   534,   177,  3437,   364,  1967],
       [  101, 25717,   928,     0,     0,     0],
       [14945,     1,  2404,  5283,     7, 21251]])

In [35]:
mf.head()

Unnamed: 0,user,userId,title,author,article_id,time,title_cleaned,category_preprocessed,authors_onehot,title_tokenized,category_preprocessed_0,category_preprocessed_1,body_cleaned,rating,news_id,rank_latest
0,0,cx:0d6120e0df4899ed1f18e5377c62644a:liav87wp9vf6,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 17:07:21,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,4.0
1,2,cx:1011q8udhpdp823kp4a8u6tdsd:32ov38haips8f,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 14:18:12,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,1.0
2,3,cx:101v4ycd1sfin1456ev4gyewlp:17jjqj6zeb2qt,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 15:29:53,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,5.0
3,18,cx:1095s7w0nz6973fry6x6a6ok2y:30w9pefm9xv45,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 16:51:06,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,1.0
4,20,cx:109jjai79rvwa3933affekcaxf:1htwz1j314bp2,Slik blir ferieåret 2017,frank lervik,f2ce698b3daf00cfcac0d5279053c4da9de07a92,2017-01-01 13:33:33,ferieåret 2017,"[nyheter, sortrondelag]",16,"[28069, 523, 0, 0, 0, 0]",1,2,norge ti bevegelige helligdager falle hverdage...,1,1125,6.0


# 2 Models

In [36]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Reshape, Multiply, Dropout, Dense, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.layers import Layer, SpatialDropout1D, GlobalMaxPooling1D, Bidirectional, GRU
from tensorflow.keras.layers import Dot, TimeDistributed, BatchNormalization
from tensorflow.keras import initializers, regularizers, constraints
import keras.backend as K


In [37]:
# parameters
article_embedding_input_dim = len(mf["news_id"].unique())
user_embedding_input_dim = len(mf["user"].unique())
category_embedding_input_dim = len(category_to_id)
author_embedding_input_dim = len(authors_to_id) + 1
print("Article vocab size: ", article_embedding_input_dim)
print("User vocab size: ", user_embedding_input_dim)
print("Category vocab size:", category_embedding_input_dim)

Article vocab size:  1192
User vocab size:  21669
Category vocab size: 266


In [None]:
def two_tower_feature_model(article_embedding_input_dim,
                           user_embedding_input_dim,
                           category_embedding_input_dim, # TODO: fix
                           author_embedding_input_dim): # TODO: fix
    
    user_id_input = Input(shape=(1,), name="user_id")
    item_id_input = Input(shape=(1,), name="article_id")
    
    first_category = Input(shape=(1,), name="first_category")
    second_category = Input(shape=(1,), name="second_category")
    
    
    
    embedding_user_size = 20
    embedding_item_size = 20
    embedding_category_size = 20
    
    user_embedding = Embedding(input_dim=user_embedding_input_dim, output_dim=embedding_user_size, input_length=1, name="user_embedding")(user_id_input)
    item_embedding = Embedding(input_dim=article_embedding_input_dim, output_dim=embedding_item_size, input_length=1, name="item_embedding")(item_id_input)
    category_embedding = Embedding(input_dim=category_embedding_input_dim, output_dim=embedding_category_size, input_length=1, name="category_embedding")
    
    
    first_category_embedding = category_embedding(first_category)
    second_category_embedding = category_embedding(second_category)
    # reshape from (batch_size, input_length, embedding_size) to (batch_size, embedding_size)
    user_vecs = Reshape([embedding_user_size])(user_embedding)
    item_vecs = Reshape([embedding_item_size])(item_embedding)
    category_vecs_1 = Reshape([embedding_category_size])(first_category_embedding)
    category_vecs_2 = Reshape([embedding_category_size])(second_category_embedding)

    item_vecs_complete = Concatenate()([item_vecs, category_vecs_1, category_vecs_2,])
    
    input_vecs = Concatenate()([user_vecs, item_vecs_complete])
    
    x = Dense(128, activation="relu")(input_vecs)
    y = Dense(1)(x)
    
    model = keras.Model(inputs=[user_id_input, item_id_input, first_category, second_category], outputs=y)
    model.compile(
        optimizer=keras.optimizers.Adam(0.01),
        loss=tf.losses.MSE,
        metrics=["accuracy"],
    )
    
    return model

two_tower_feat = two_tower_feature_model(article_embedding_input_dim,
                                        user_embedding_input_dim,
                                        category_embedding_input_dim)

In [None]:
history_1 = two_tower_feat.fit([users, news_articles, category_1, category_2], labels, batch_size=64, epochs=3, validation_split=0.1)


In [None]:
#print(users, news_articles, category_1, category_2)

## 2.2 Model with categories and authors

In [39]:
author_embedding_input_dim = len(authors_to_id) +1
title_vocab_size = vocab_size

In [None]:
def two_tower_feature_model(article_embedding_input_dim,
                           user_embedding_input_dim,
                           category_embedding_input_dim,
                            author_embedding_input_dim,
                           title_vocab_size): #title_vocab_size
    
    user_id_input = Input(shape=(1,), name="user_id")
    item_id_input = Input(shape=(1,), name="article_id")
    
    first_category = Input(shape=(1,), name="first_category")
    second_category = Input(shape=(1,), name="second_category")
    
    authors = Input(shape=(1,), name="authors")
    titles = Input(shape=(6,), name="title")
    
    embedding_user_size = 20
    embedding_item_size = 20
    embedding_category_size = 20
    embedding_author_size = 20
    embedding_title_size = 20
    
    user_embedding = Embedding(input_dim=user_embedding_input_dim, output_dim=embedding_user_size, input_length=1, name="user_embedding")(user_id_input)
    item_embedding = Embedding(input_dim=article_embedding_input_dim, output_dim=embedding_item_size, input_length=1, name="item_embedding")(item_id_input)
    category_embedding = Embedding(input_dim=category_embedding_input_dim, output_dim=embedding_category_size, input_length=1, name="category_embedding")
    
    first_category_embedding = category_embedding(first_category)
    second_category_embedding = category_embedding(second_category)
    
    author_embedding = Embedding(input_dim=author_embedding_input_dim, output_dim=embedding_author_size)(authors)
    title_embedding = Embedding(input_dim=title_vocab_size, output_dim=embedding_title_size)(titles)
    title_embedding = GlobalAveragePooling1D()(title_embedding)
    
    # reshape from (batch_size, input_length, embedding_size) to (batch_size, embedding_size)
    user_vecs = Reshape([embedding_user_size])(user_embedding)
    item_vecs = Reshape([embedding_item_size])(item_embedding)
    category_vecs_1 = Reshape([embedding_category_size])(first_category_embedding)
    category_vecs_2 = Reshape([embedding_category_size])(second_category_embedding)
    
    author_vecs = Reshape([embedding_author_size])(author_embedding)
    title_vecs = Reshape([embedding_title_size])(title_embedding)
    

    item_vecs_complete = Concatenate()([item_vecs, 
                                        category_vecs_1, 
                                        category_vecs_2,  
                                        author_vecs, 
                                        title_vecs])
    
    input_vecs = Concatenate()([user_vecs, item_vecs_complete])
    
    x = Dense(128, activation="relu")(input_vecs)
    y = Dense(1)(x)
    
    model = keras.Model(inputs=[user_id_input, 
                                item_id_input, 
                                first_category, 
                                second_category, authors, titles], outputs=y) #titles
    model.compile(
        optimizer=keras.optimizers.Adam(0.01),
        loss=tf.losses.binary_crossentropy,
        metrics=["accuracy"],
    )
    
    return model

two_tower_feat = two_tower_feature_model(article_embedding_input_dim,
                                        user_embedding_input_dim,
                                        category_embedding_input_dim,
                                         author_embedding_input_dim,
                                        title_vocab_size)


# MSE vs bin_crossentropy: https://susanqq.github.io/tmp_post/2017-09-05-crossentropyvsmes/


In [None]:
[print(i.shape, i.dtype) for i in two_tower_feat.inputs]
[print(o.shape, o.dtype) for o in two_tower_feat.outputs]
[print(l.name, l.input_shape, l.dtype) for l in two_tower_feat.layers]

In [None]:
history_2 = two_tower_feat.fit([users, news_articles, category_1, category_2, train_authors, train_titles ], labels, batch_size=64, epochs=3, validation_split=0.1)
#train_titles

### Two tower model with attention

In [42]:
def two_tower_attention_model(article_embedding_input_dim,
                           user_embedding_input_dim,
                           category_embedding_input_dim,
                            author_embedding_input_dim,
                           title_vocab_size): #title_vocab_size
    
    user_id_input = Input(shape=(1,), name="user_id")
    item_id_input = Input(shape=(1,), name="article_id")
    
    first_category = Input(shape=(1,), name="first_category")
    second_category = Input(shape=(1,), name="second_category")
    
    authors = Input(shape=(1,), name="authors")
    titles = Input(shape=(6,), name="title")
    
    embedding_user_size = 20
    embedding_item_size = 20
    embedding_category_size = 20
    embedding_author_size = 20
    embedding_title_size = 20
    
    user_embedding = Embedding(input_dim=user_embedding_input_dim, output_dim=embedding_user_size, input_length=1, name="user_embedding")(user_id_input)
    item_embedding = Embedding(input_dim=article_embedding_input_dim, output_dim=embedding_item_size, input_length=1, name="item_embedding")(item_id_input)
    category_embedding = Embedding(input_dim=category_embedding_input_dim, output_dim=embedding_category_size, input_length=1, name="category_embedding")
    
    first_category_embedding = category_embedding(first_category)
    second_category_embedding = category_embedding(second_category)
    
    author_embedding = Embedding(input_dim=author_embedding_input_dim, output_dim=embedding_author_size)(authors)
    title_embedding = Embedding(input_dim=title_vocab_size, output_dim=embedding_title_size, name="title_embedding")(titles)
    
    rnn_outs = Bidirectional(GRU(64, return_sequences=True), name="gru")(title_embedding)
    sentence, word_scores = Attention(return_attention=True, name="attention_vec")(rnn_outs)
    fc_sentence = Dense(20, activation="relu")(sentence)
    print(fc_sentence.shape)
    #title_embedding = GlobalAveragePooling1D()(title_embedding)
    
    # reshape from (batch_size, input_length, embedding_size) to (batch_size, embedding_size)
    user_vecs = Reshape([embedding_user_size])(user_embedding)
    item_vecs = Reshape([embedding_item_size])(item_embedding)
    category_vecs_1 = Reshape([embedding_category_size])(first_category_embedding)
    category_vecs_2 = Reshape([embedding_category_size])(second_category_embedding)
    
    author_vecs = Reshape([embedding_author_size])(author_embedding)
    #title_vecs = Reshape([embedding_title_size])(title_embedding)
    

    item_vecs_complete = Concatenate()([item_vecs, 
                                        category_vecs_1, 
                                        category_vecs_2,  
                                        author_vecs, 
                                        fc_sentence])
    
    input_vecs = Concatenate()([user_vecs, item_vecs_complete])
    
    x = Dense(128, activation="relu")(input_vecs)
    y = Dense(1)(x)
    
    model = keras.Model(inputs=[user_id_input, 
                                item_id_input, 
                                first_category, 
                                second_category, authors, titles], outputs=y) #titles
    model.compile(
        optimizer=keras.optimizers.Adam(0.01),
        loss=tf.losses.binary_crossentropy,
        metrics=["accuracy"],
    )
    
    return model

two_tower_attention = two_tower_attention_model(article_embedding_input_dim,
                                        user_embedding_input_dim,
                                        category_embedding_input_dim,
                                         author_embedding_input_dim,
                                        title_vocab_size)


# MSE vs bin_crossentropy: https://susanqq.github.io/tmp_post/2017-09-05-crossentropyvsmes/


(None, 20)


In [43]:
#[print(i.shape, i.dtype) for i in two_tower_attention.inputs]
#[print(o.shape, o.dtype) for o in two_tower_attention.outputs]
#[print(l.name, l.input_shape, l.dtype) for l in two_tower_attention.layers]

In [44]:
history_3 = two_tower_attention.fit([users, news_articles, category_1, category_2, train_authors, train_titles ], labels, batch_size=64, epochs=3, validation_split=0.1)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [122]:
from IPython.core.display import display, HTML
def rgb_to_hex(rgb):
    return '#%02x%02x%02x' % rgb

def attention2color(attention_score):
    r = 255 - int(attention_score*255)
    color= rgb_to_hex((255, r, r))
    return str(color)

def visualize_attention(model, user, item, c_0, c_1, author, title, idx_to_word):
    """
    Inspire
    
    Params: 
        model: trained attention model
        test_array: array consisting of (users, news_articles, category_1, category_2, authors, titles)
        idx_to_word: map each word_index in the title to word
        
    """
    
    # a= np.expand_dims(np.array(users[0]), axis=0)
    items_to_predict = [np.expand_dims(np.array(user), 0), 
                       np.expand_dims(np.array(item), 0),
                       np.expand_dims(np.array(c_0), 0),
                       np.expand_dims(np.array(c_1), 0),
                       np.expand_dims(np.array(author), 0),
                       np.expand_dims(np.array(title), 0)]
    
    
    model_attention = keras.Model(inputs=model.input, 
                        outputs=[model.output, model.get_layer("attention_vec").output[-1]])
    label_probs, attentions = model_attention.predict(items_to_predict)
    
    idx_words = title
    decoded_text = [idx_to_word[idx] for idx in idx_words]
    
    attentions = attentions[0]
    attentions_text = (attentions - np.min(attentions)) / (np.max(attentions) - np.min(attentions)) # TODO: justify normalization
    
    token_attention_dict = {}
    for token, attention_score in zip(decoded_text, attentions_text):
        token_attention_dict[token] = attention_score
    
    print(token_attention_dict)
    html_text = "<hr><p style='font-size: large'><b>Text:  </b>"
    for token, attention in token_attention_dict.items():
        html_text += "<span style='background-color:{};'>{} <span> ".format(attention2color(attention), token)
    display(HTML(html_text))
    
            
        
    


In [125]:
#visualize_attention(two_tower_attention,2, 1125, 1,2,16,[28069, 523, 0, 0, 0, 0],reverse_word_map)

In [126]:
def generate_recommendations(model, user_id, all_article_ids):
    """
        Params: 
            model: the model
            user_id: the user we want predictions for
            all_article_ids:
            
        Return: 
            article_ids
    """
    #test_user_item_set = set(zip(mf_test["user"], mf_test["news_id"], mf_test["category_preprocessed_0"], mf_test["category_preprocessed_1"]))
    user_interacted_items = mf.groupby("user")["news_id"].apply(list).to_dict()
    
    #user_id, article_id, cat_1, cat_2, title_t = mf["user"].iloc[0], mf["news_id"].iloc[0], mf["category_preprocessed_0"].iloc[0], mf["category_preprocessed_1"].iloc[0], mf["title_tokenized"].iloc[0]
    c_0, c_1, titles, authors = [], [], [], []
    article_id = mf_test[mf_test["user"] == user_id]["news_id"].values[0]
    
    interacted_items = user_interacted_items[user_id] # user_id
    not_interacted_items = set(all_article_ids) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items),99))
    test_items = selected_not_interacted + [article_id]
    for elem in test_items:
        c_0.append(news_id_to_cat0[elem])
        c_1.append(news_id_to_cat1[elem])
        titles.append(newsid_to_title[elem])
        authors.append(newsid_to_author[elem])
    assert len(c_0) == len(c_1) == len(titles) == 100
    
    preds = model.predict([np.array([user_id]*100), # user_id
                               np.array(test_items), 
                               np.array(c_0), 
                               np.array(c_1), 
                               np.array(authors),
                               np.array(titles)])
    pred_labels = np.squeeze(preds)

    top10_items = [test_items[i] for i in np.argsort(pred_labels)[::-1][0:10].tolist()]
    
    return top10_items

recs = generate_recommendations(two_tower_attention, 2, mf["news_id"].unique())

In [127]:
def visualize_recommendations(recs, user_id):
    #TODO: there is something wrong with the attention-display and the corresponding title-recommendation
    """
        Params:
            recs: list with 10 item_ids
            user_id: the user id for which 10 recommendations is provided
        Return:
            print read-history, and vi
    """ 
    user_history = mf[mf["user"] == user_id]
    read_history = user_history["title"].values[0:4]
    news_ids_history = user_history["news_id"].values[0:4]
    print("You have read: ")
    for news_ids_history, hist in zip(news_ids_history, read_history):
        print(news_ids_history, " ", hist)
    
    print("\n-------------------\n")
    print("Top {} recommendations".format(len(recs)), "\n")
    for item in recs:
        item_df = mf[mf["news_id"] == item]
        title = item_df["title"].values[0]
        idx = item_df["news_id"].values[0]
        print(idx, title)
        
        c_0 = item_df["category_preprocessed_0"].values[0]
        c_1 = item_df["category_preprocessed_1"].values[0]
        author = item_df["authors_onehot"].values[0]
        title_tokenized = item_df["title_tokenized"].values[0]
        visualize_attention(two_tower_attention, user_id, idx, c_0, c_1, author, title_tokenized, reverse_word_map)
        
visualize_recommendations(recs, 2)

You have read: 
1125   Slik blir ferieåret 2017
740   Nødbluss sendt gjennom vindu startet branntilløp
30   På dette bildet skiller Magnus Carlsen seg ut: - Litt tilfeldig
709   Se lesernes nyttårsbilder

-------------------

Top 10 recommendations 

861 To kaldeste novemberdøgn i Trøndelag på 90 år
{'to': 1.0, 'kaldeste': 0.0034951915, 'novemberdøgn': 0.00029116444, 'trøndelag': 0.0, '90': 2.802001e-05, 'år': 2.4365224e-06}


322 - Nordmenn var mistenkt for doping av FIS
{'nordmenn': 1.0, 'mistenkt': 0.029807728, 'doping': 0.0068156435, 'fis': 0.0021248797, '': 1.10420315e-05}


885 Oppdal flyplass, Fagerhaug
{'oppdal': 1.0, 'flyplass': 0.03099891, 'fagerhaug': 0.010274553, '': 0.0}


1154 Syv sosiale medier som barna kjenner bedre enn deg
{'syv': 1.0, 'sosiale': 0.0010458861, 'medier': 9.216412e-05, 'barna': 3.8061877e-05, 'kjenner': 0.0, 'bedre': 1.2506046e-05}


332 Bestefar dømt til fem år for overgrep mot to barnebarn
{'bestefar': 1.0, 'dømt': 0.00225093, 'fem': 0.0, 'år': 5.5314467e-05, 'overgrep': 4.2038995e-05, 'to': 0.00010104109}


332 Bestefar dømt til fem år for overgrep mot to barnebarn
{'bestefar': 1.0, 'dømt': 0.00225093, 'fem': 0.0, 'år': 5.5314467e-05, 'overgrep': 4.2038995e-05, 'to': 0.00010104109}


846 E6 Hjerkinn
{'e6': 1.0, 'hjerkinn': 0.0066309557, '': 0.0}


546 Frykter at gode fiskeplasser går tapt til Salmars nye havmerd
{'frykter': 1.0, 'gode': 0.052780543, 'fiskeplasser': 0.001716885, 'går': 8.225235e-05, 'tapt': 0.0, 'salmars': 1.0304964e-06}


923 Rørlegger Mari (29) deltar i ny ekstrem-reality
{'rørlegger': 1.0, 'mari': 0.008019812, '29': 0.00021595886, 'deltar': 4.363401e-05, 'ny': 0.0, 'ekstremreality': 3.360998e-05}


828 Norges sprøeste bilrace
{'norges': 1.0, 'sprøeste': 0.017068997, 'bilrace': 0.0028344116, '': 0.0}


In [None]:
mf[mf["user"] == 2]

# Testing, validation

In [46]:
def hit_ratio_features(mf_test, mf, all_article_ids, model):
    test_user_item_set = set(zip(mf_test["user"], mf_test["news_id"], mf_test["category_preprocessed_0"], mf_test["category_preprocessed_1"]))
    user_interacted_items = mf.groupby("user")["news_id"].apply(list).to_dict()
    hits = []
    for (u,i, c_1,c_2) in tqdm(test_user_item_set):
        c_0 = []
        c_1 = []
        titles = []
        authors = []
        interacted_items = user_interacted_items[u]
        not_interacted_items = set(all_article_ids) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items),99))
        test_items = selected_not_interacted + [i]
        for elem in test_items:
            c_0.append(news_id_to_cat0[elem])
            c_1.append(news_id_to_cat1[elem])
            titles.append(newsid_to_title[elem])
            authors.append(newsid_to_author[elem])
        assert len(c_0) == len(c_1) == len(titles) == 100
        
        preds = model.predict([np.array([u]*100), 
                               np.array(test_items), 
                               np.array(c_0), 
                               np.array(c_1), 
                               np.array(authors),
                               np.array(titles)])

        pred_labels = np.squeeze(preds)

        top10_items = [test_items[i] for i in np.argsort(pred_labels)[::-1][0:10].tolist()]

        if i in top10_items:
            hits.append(1)
        else:
            hits.append(0)
    print("Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))
hit_ratio_features(mf_test, mf, mf["news_id"].unique(), two_tower_attention)

  1%|          | 199/21669 [00:10<19:44, 18.12it/s]


KeyboardInterrupt: 

In [None]:
weights = two_tower_feat.get_weights()

In [None]:
category_embedding = weights[1]
len(category_embedding)

In [None]:
category_embedding.shape

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
category_tsne = TSNE(perplexity=20).fit_transform(category_embedding)
a = pd.DataFrame(category_tsne)
a.columns=["x", "y"]
plt.figure(figsize=(20,7))
plt.scatter(a["x"], a["y"])
plt.show()

In [None]:
a.head(30)

In [None]:
test_articles = articles.copy()
article_label_enc = LabelEncoder()
test_articles["news_id"] = article_label_enc.fit_transform(test_articles["article_id"].values)
test_articles.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [None]:
vectorizer = TfidfVectorizer(analyzer="word",
                            ngram_range=(1,2),
                            min_df=0.003,
                            max_df=0.5,
                            max_features=5000)

In [None]:
item_ids = mf["news_id"].unique().tolist()
tfidf_matrix = vectorizer.fit_transform(mf["title_cleaned"] + "" +mf["body_cleaned"])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

In [None]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_user_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df["news_id"])
    
    user_profile_norm = sklearn.preprocessing.normalize(user_item_profiles)
    return user_profile_norm

def get_profile(idx):
    interactions = mf[mf["user"] == idx]
    profile = build_user_profile(idx, )

In [None]:
get_item_profiles([1,3])

In [None]:
mf[mf["user"] == 0]

In [41]:
# Util classes
class Attention(Layer):
    # this is fetched from: https://www.kaggle.com/alber8295/bigru-w-attention-visualized-for-beginners
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, return_attention=False,
                 **kwargs):
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)


    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

In [None]:
def visualize_attention_from_online(model, tokenizer):
    # Make new model for output predictions and attentions
    '''
    model.get_layer('attention_vec').output:
    attention_vec (Attention)    [(None, 128), (None, 54)] <- We want (None,54) that is the word att
    '''
    model_att = Model(inputs=model.input, \
                            outputs=[model.output, model.get_layer('attention_vec').output[-1]])
    idx = np.random.randint(low = 0, high=X_te.shape[0]) # Get a random test
    tokenized_sample = np.trim_zeros(X_te[idx]) # Get the tokenized text
    label_probs, attentions = model_att.predict(X_te[idx:idx+1]) # Perform the prediction

    # Get decoded text and labels
    id2word = dict(map(reversed, tokenizer.word_index.items()))
    decoded_text = [id2word[word] for word in tokenized_sample] 
    
    # Get classification
    label = (label_probs>0.5).astype(int).squeeze() # Only one
    label2id = ['Sincere', 'Insincere']

    # Get word attentions using attenion vector
    token_attention_dic = {}
    max_score = 0.0
    min_score = 0.0
    
    attentions_text = attentions[0,-len(tokenized_sample):]
    #plt.bar(np.arange(0,len(attentions.squeeze())), attentions.squeeze())
    #plt.show();
    #print(attentions_text)
    attentions_text = (attentions_text - np.min(attentions_text)) / (np.max(attentions_text) - np.min(attentions_text))
    for token, attention_score in zip(decoded_text, attentions_text):
        #print(token, attention_score)
        token_attention_dic[token] = attention_score
        

    # Build HTML String to viualize attentions
    html_text = "<hr><p style='font-size: large'><b>Text:  </b>"
    for token, attention in token_attention_dic.items():
        html_text += "<span style='background-color:{};'>{} <span> ".format(attention2color(attention),
                                                                            token)
    #html_text += "</p><br>"
    #html_text += "<p style='font-size: large'><b>Classified as:</b> "
    #html_text += label2id[label] 
    #html_text += "</p>"
    
    # Display text enriched with attention scores 
    display(HTML(html_text))
    
    # PLOT EMOTION SCORES
    _labels = ['sincere', 'insincere']
    probs = np.zeros(2)
    probs[1] = label_probs
    probs[0] = 1- label_probs
    plt.figure(figsize=(5,2))
    plt.bar(np.arange(len(_labels)), probs.squeeze(), align='center', alpha=0.5, color=['black', 'red', 'green', 'blue', 'cyan', "purple"])
    plt.xticks(np.arange(len(_labels)), _labels)
    plt.ylabel('Scores')
    plt.show()