In [1]:
import json
import re
import urllib
import urllib.parse

from collections import Counter, OrderedDict, defaultdict
from pprint import pprint

import numpy as np
import yake

from nltk import ngrams
from nltk.stem import PorterStemmer
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

# Extract keywords from Wikipedia text

In [25]:
# top_k: base keyword count (each extractor is asked for twice this many
# candidates); max_n_grams: largest n-gram size an extractor is built for.
top_k = 10
max_n_grams = 3

# kw_extractors[n] is the YAKE extractor created with n=n. Index 0 holds a
# None placeholder so the list can be indexed directly by n-gram size.
kw_extractors = [None] + [
    yake.KeywordExtractor(n=size, top=2 * top_k)
    for size in range(1, max_n_grams + 1)
]

In [26]:
def stem_text(text):
    """Split ``text`` on whitespace and Porter-stem every token.

    Args:
        text: String to tokenize with ``str.split()``.

    Returns:
        List with one stemmed token per whitespace-separated token of
        ``text``, in the original order.
    """
    porter = PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


def get_kw(text, max_ngram_size=2, return_scores=False):
    """Run the extractor stored at ``kw_extractors[max_ngram_size]`` on ``text``.

    Args:
        text: Text handed to the extractor's ``extract_keywords``.
        max_ngram_size: Index into the module-level ``kw_extractors`` list.
        return_scores: When true, return the extractor output unchanged.

    Returns:
        The raw ``extract_keywords`` result if ``return_scores`` is true,
        otherwise a list holding the first element of each result item
        (presumably the keyword of a (keyword, score) pair; confirm against
        the YAKE version in use).
    """
    extracted = kw_extractors[max_ngram_size].extract_keywords(text)
    if return_scores:
        return extracted
    return [item[0] for item in extracted]


def get_all_n_grams(text, n_max=max_n_grams):
    """List every n-gram of ``text`` for n = 1 .. ``n_max`` as joined strings.

    Args:
        text: Sequence of string items (the call sites in this notebook pass
            a list of stemmed tokens).
        n_max: Largest n-gram size to generate; defaults to the value of
            ``max_n_grams`` at definition time.

    Returns:
        List of space-joined n-grams: all 1-grams first, then all 2-grams,
        and so on up to ``n_max``.
    """
    joined = []
    for size in range(1, n_max + 1):
        for gram in ngrams(text, n=size):
            joined.append(" ".join(gram))
    return joined


def clean_text(text):
    """Lowercase ``text`` and keep only word characters and spaces.

    Tabs, carriage returns and newlines are turned into a single space
    before punctuation is dropped, so words separated only by a tab or a
    line break remain separate tokens instead of being glued together.

    Args:
        text: Raw input string.

    Returns:
        The lowercased string containing only ``\\w`` characters and spaces.
    """
    c_text = text.lower()
    # A plain str.replace("\t\n", " ") would only match a tab immediately
    # followed by a newline; replace any run of tabs / line breaks instead.
    c_text = re.sub(r"[\t\r\n]+", " ", c_text)
    # Drop punctuation and other non-word characters. A "+" inside the
    # character class would be a literal plus sign (and so be kept), hence
    # the class contains only the space and \w.
    c_text = re.sub(r"[^ \w]", "", c_text)
    return c_text

In [40]:
# Per n-gram size i (1..max_n_grams) and per topic:
#   keywords_yake[i][topic] -> ordered list of top_k stemmed keywords
#   tks_s[i][topic]         -> occurrence count of each keyword in the topic text
#   tks_s_norm[i][topic]    -> the same count vector after sklearn `normalize`
keywords_yake = defaultdict(dict)
tks_s = defaultdict(dict)
tks_s_norm = defaultdict(dict)

# Each line is expected to be "<topic>\t<text>"; the context manager makes
# sure the file handle is closed once the loop is done.
with open("../data/wikipedia_texts.tsv") as wiki_file:
    for line in wiki_file:
        topic, text = line.strip().split("\t", maxsplit=1)
        cleaned_text = clean_text(text)  # cleaned once, reused for every n-gram size
        stemmed_text = stem_text(cleaned_text)
        all_n_grams = get_all_n_grams(stemmed_text)
        for i in range(1, max_n_grams + 1):
            kws = get_kw(cleaned_text, i)  # Get keywords with YAKE
            # NOTE(review): stem_text splits on whitespace, so multi-word
            # keyphrases are broken into single stemmed tokens here. Confirm
            # this is intended for i > 1.
            kws = stem_text(" ".join(kws))  # Stem keywords
            kws = list(dict.fromkeys(kws))[:top_k]  # Remove duplicates, keep only the top_k
            assert len(kws) == top_k, f"fewer than {top_k} unique keywords for topic {topic!r}"
            kw_set = set(kws)
            c = Counter(x for x in all_n_grams if x in kw_set)  # Count occurrences within the text
            # Every keyword must occur at least once in the text (no zero entries).
            assert kw_set.issubset(c), f"keywords missing from the text of topic {topic!r}"
            counts = np.asarray([c[x] for x in kws])  # Vector order follows the keyword order
            tks_s[i][topic] = counts
            tks_s_norm[i][topic] = normalize(counts.reshape(1, -1)).flatten()  # Normalize
            keywords_yake[i][topic] = kws  # Store the keywords for this topic

# Embeddings of clicked documents

In [41]:
# Per n-gram size i and per clicked URL: occurrence counts of the topic's
# keywords inside the document (raw and normalized).
# NOTE(review): the vectors are keyed by URL only; if the same URL appears
# under two topics the later line overwrites the earlier one.
docs_yake_embeddings = defaultdict(dict)
docs_yake_embeddings_norm = defaultdict(dict)

# Read all lines up front so the handle is closed promptly and tqdm can
# derive the total itself instead of relying on a hard-coded line count.
with open("../data/clicked_docs_with_topics.tsv") as docs_file:
    doc_lines = docs_file.readlines()

n_malformed = 0
for line in tqdm(doc_lines, desc="clicked docs"):
    try:
        # Expected layout: "<url>\t<topic>\t<text>".
        url, topic, text = line.strip().split("\t", maxsplit=2)
    except ValueError:
        # Lines with fewer than three fields are skipped, but counted so the
        # data loss is visible after the loop.
        n_malformed += 1
        continue
    stemmed_doc = stem_text(clean_text(text))
    all_n_grams = get_all_n_grams(stemmed_doc)
    for i in range(1, max_n_grams + 1):
        topic_kws = keywords_yake[i][topic]
        kw_set = set(topic_kws)
        c = Counter(x for x in all_n_grams if x in kw_set)
        embedding = np.asarray([c[x] for x in topic_kws])  # same order as the topic keywords
        docs_yake_embeddings[i][url] = embedding
        docs_yake_embeddings_norm[i][url] = normalize(embedding.reshape(1, -1)).flatten()

if n_malformed:
    print(f"Skipped {n_malformed} malformed line(s) out of {len(doc_lines)}")

  0%|          | 0/1074 [00:00<?, ?it/s]

# Knowledge estimation

In [61]:
normalized = True
users_knowledge = []
# Load the interaction logs; the context manager closes the file afterwards.
with open("../data/logs_with_position.json") as logs_file:
    dataset = json.load(logs_file)
final_knowledges = defaultdict(OrderedDict)
final_knowledges_norm = defaultdict(OrderedDict)
# URLs that have an embedding. The embedding loop stores every URL under all
# n-gram sizes, so any size yields the same key set; index explicitly by
# max_n_grams rather than by a loop variable left over from an earlier cell.
valid_urls = docs_yake_embeddings[max_n_grams].keys()
f_dist = cosine  # distance function; similarity is computed as 1 - f_dist


def trim_upper_limit(cks, tks):
    """Cap each entry of ``cks`` at the entry of ``tks`` in the same position.

    Args:
        cks: Indexable sequence of numbers; its length decides how many
            positions are compared.
        tks: Indexable sequence of upper limits, at least as long as ``cks``.

    Returns:
        ``numpy.ndarray`` with ``min(cks[k], tks[k])`` at every position ``k``.
    """
    capped = []
    for pos in range(len(cks)):
        capped.append(min(cks[pos], tks[pos]))
    return np.asarray(capped)

In [62]:
# For every user, sum the keyword vectors of the clicked documents and compare
# the sum with the topic's target vector. Four variants are stored per n-gram
# size i under the keys "<i>", "<i>_norm", "<i>_ceil" and "<i>_ceil_norm".
for u in dataset:
    u_id = u["userID"]
    ALG = u["ALG"]
    RPL = u["RPL"]
    # presumably the keys of keywords_yake are URL-quoted topic titles; verify against the TSV
    topic = urllib.parse.quote(u["topic_title"])

    # Accumulators start as real NumPy zero vectors. Starting from Python
    # lists would only turn into arrays through NumPy's reflected add on
    # `list += ndarray`, and would stay plain lists when no click is valid.
    cks_s = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}
    cks_s_norm = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}

    # "ceil" variants: each document's contribution is capped element-wise by
    # the topic vector before it is added.
    cks_s_ceil = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}
    cks_s_ceil_norm = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}

    for d in u["clicks"]:
        url = d["url"]
        if url not in valid_urls:
            continue  # no embedding was computed for this URL
        for i in range(1, max_n_grams + 1):
            cks_s[i] += docs_yake_embeddings[i][url]
            cks_s_ceil[i] += trim_upper_limit(docs_yake_embeddings[i][url], tks_s[i][topic])

            cks_s_norm[i] += docs_yake_embeddings_norm[i][url]
            cks_s_ceil_norm[i] += trim_upper_limit(docs_yake_embeddings_norm[i][url], tks_s_norm[i][topic])

    # normalized vs un-normalized
    for i in range(1, max_n_grams + 1):
        # variant key -> (accumulated user vector, topic target vector)
        variants = {
            f"{i}": (cks_s[i], tks_s[i][topic]),
            f"{i}_norm": (cks_s_norm[i], tks_s_norm[i][topic]),
            f"{i}_ceil": (cks_s_ceil[i], tks_s[i][topic]),
            f"{i}_ceil_norm": (cks_s_ceil_norm[i], tks_s_norm[i][topic]),
        }
        for variant, (cks, tks) in variants.items():
            final_knowledges[variant][u_id] = {
                "RPL": RPL,
                "ALG": ALG,
                "final_sim": 1 - f_dist(cks, tks),  # similarity = 1 - distance
            }

In [63]:
RPLs = []
ALGs = []
u_ids = [u["userID"] for u in dataset]

# Pearson correlation of the estimated similarity with RPL and with ALG, one
# entry per method variant. The RPL result gets its own dict; writing both
# results to pearsons_ALG[m] would discard the RPL correlation.
pearsons_RPL = {}
pearsons_ALG = {}
corr_ALG = {}

for m in final_knowledges:
    u_ids = final_knowledges[m].keys()
    ALGs = [final_knowledges[m][u]["ALG"] for u in u_ids]
    RPLs = [final_knowledges[m][u]["RPL"] for u in u_ids]
    results = [final_knowledges[m][u]["final_sim"] for u in u_ids]

    pearsons_RPL[m] = pearsonr(results, RPLs)
    pearsons_ALG[m] = pearsonr(results, ALGs)

# Rank the variants by their correlation coefficient with ALG, strongest first.
pprint(sorted(pearsons_ALG.items(), key=lambda x: x[1][0], reverse=True))

[('3', (0.3039136843607386, 0.0005135112025245985)),
 ('3_norm', (0.2844631123878325, 0.0011899694757848593)),
 ('1_norm', (0.23197175163536946, 0.00868428296746364)),
 ('3_ceil_norm', (0.22758595200716975, 0.010074383187389579)),
 ('3_ceil', (0.2195274428474504, 0.013145019476534403)),
 ('2', (0.2156041447125589, 0.01491588031976827)),
 ('2_ceil_norm', (0.20866583166321645, 0.018559054097283884)),
 ('2_ceil', (0.20660535399405, 0.0197793958284606)),
 ('1_ceil_norm', (0.20355512535422587, 0.02171277247387708)),
 ('1_ceil', (0.19754585937546787, 0.026000567691690304)),
 ('2_norm', (0.18204687457827934, 0.040517921559252615)),
 ('1', (0.15625594584281702, 0.07937966417687231))]
