In [78]:
# NOTE(review): leftover debugging probe. In file order this cell sits above the
# import cell that binds `tqdm`, so it raises NameError on a fresh
# Restart-and-Run-All; consider deleting it.
type(tqdm)

type

In [199]:
import json
import urllib
import urllib.parse

from collections import Counter, OrderedDict, defaultdict
from pprint import pprint

import numpy as np
import yake

from nltk.stem import PorterStemmer
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

# Extract keywords from Wikipedia text

In [89]:
# Keyword-extraction settings: number of keywords each extractor keeps and the
# largest n-gram size an extractor is built for.
top_k = 10
max_n_grams = 3

# kw_extractors[n] is the YAKE extractor created with n=n; slot 0 holds a None
# placeholder so the list can be indexed directly by n-gram size.
kw_extractors = [None, *(yake.KeywordExtractor(n=size, top=top_k) for size in range(1, max_n_grams + 1))]

In [113]:
def stem_text(text, join=True):
    """Stem every whitespace-separated token of ``text`` with a Porter stemmer.

    Args:
        text: Input string; it is tokenized with ``str.split()``.
        join: When true, return the stems joined by single spaces; otherwise
            return them as a list.

    Returns:
        A space-joined string of stems, or the list of stems.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in text.split()]
    return " ".join(stems) if join else stems


def get_kw(text, max_ngram_size=2, top_k=10):
    """Return the keywords that the pre-built YAKE extractor finds in ``text``.

    Args:
        text: Text to extract keywords from.
        max_ngram_size: Index into the module-level ``kw_extractors`` list,
            selecting the extractor that was created with that ``n``.
        top_k: Maximum number of keywords to return. The extractors are built
            ahead of time with their own fixed ``top``, so this can only
            shorten their output, never extend it; ``None`` disables the cap.

    Returns:
        A list with the first element of each extractor result, in the order
        the extractor returned them.
    """
    scored_keywords = kw_extractors[max_ngram_size].extract_keywords(text)
    # Honor the caller's top_k instead of silently ignoring the argument.
    return [entry[0] for entry in scored_keywords[:top_k]]

In [173]:
stemmer = PorterStemmer()
# All three maps are keyed by n-gram size first, then by topic:
#   keywords_yake -> keywords extracted from the stemmed Wikipedia text
#   tks_s         -> occurrence count of each keyword in that text
#   tks_s_norm    -> the same count vector passed through `normalize`
keywords_yake = defaultdict(dict)
tks_s = defaultdict(dict)
tks_s_norm = defaultdict(dict)

# The context manager closes the file handle once the loop is done.
with open("../data/wikipedia_texts.tsv") as wiki_file:
    for line in wiki_file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        topic, text = line.strip().split("\t", maxsplit=1)
        stemmed_text = stem_text(text)
        # Token counts do not depend on the n-gram size, so count once per text.
        token_counts = Counter(stemmed_text.split())
        for i in range(1, max_n_grams + 1):
            kws = get_kw(stemmed_text, i)
            keywords_yake[i][topic] = kws
            # NOTE(review): counts are looked up per whitespace-separated token, so a
            # keyword containing a space can never get a non-zero count -- confirm
            # whether multi-word keywords are meant to be matched as n-grams.
            keyword_counts = np.asarray([token_counts[kw] for kw in kws])
            tks_s[i][topic] = keyword_counts
            tks_s_norm[i][topic] = normalize(keyword_counts.reshape(1, -1)).flatten()

# Embeddings of clicked documents

In [174]:
# Both maps are keyed by n-gram size first, then by document URL. Each value is the
# vector of occurrence counts of the document topic's keywords in the stemmed
# document (raw, and passed through `normalize`).
docs_yake_embeddings = defaultdict(dict)
docs_yake_embeddings_norm = defaultdict(dict)

# The context manager closes the file handle once the loop is done.
with open("../data/clicked_docs_with_topics.tsv") as docs_file:
    for idx, line in tqdm(enumerate(docs_file), total=1074):
        if idx % 100 == 0:
            print(f"{idx}/1074")
        try:
            url, topic, text = line.strip().split("\t", maxsplit=2)
        except ValueError:
            # Lines that do not split into three tab-separated fields are skipped.
            continue
        stemmed_doc = stem_text(text, join=False)
        # Token counts do not depend on the n-gram size, so count once per document.
        token_counts = Counter(stemmed_doc)
        for i in range(1, max_n_grams + 1):
            topic_kws = keywords_yake[i][topic]
            # NOTE(review): stemmed_doc holds single whitespace-separated tokens, so a
            # keyword containing a space always gets a zero count -- confirm intent.
            keyword_counts = np.asarray([token_counts[kw] for kw in topic_kws])
            docs_yake_embeddings[i][url] = keyword_counts
            docs_yake_embeddings_norm[i][url] = normalize(keyword_counts.reshape(1, -1)).flatten()

  0%|          | 0/1074 [00:00<?, ?it/s]

0/1074
100/1074
200/1074
300/1074
400/1074
500/1074
600/1074
700/1074
800/1074
900/1074
1000/1074


# Knowledge estimation

In [175]:
import json

from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.preprocessing import normalize

In [194]:
normalized = True
users_knowledge = []
# Read the interaction logs through a context manager so the handle is closed.
with open("../data/logs_with_position.json") as logs_file:
    dataset = json.load(logs_file)
# Keyed by variant name first, then by user id (filled in by the per-user loop).
final_knowledges = defaultdict(OrderedDict)
final_knowledges_norm = defaultdict(OrderedDict)
# Index with max_n_grams explicitly rather than with a loop variable `i` left over
# from an earlier cell; the embedding loop stores every URL under each n-gram size.
valid_urls = docs_yake_embeddings[max_n_grams].keys()
# Distance function used for the final similarity (similarity = 1 - distance).
f_dist = cosine


def trim_upper_limit(cks, tks):
    """Cap each entry of ``cks`` at the entry of ``tks`` in the same position.

    Args:
        cks: Indexable sequence of values to cap; its length drives the result.
        tks: Indexable sequence of upper limits, at least as long as ``cks``.

    Returns:
        A list of ``len(cks)`` items holding ``min(cks[k], tks[k])``.
    """
    capped = []
    for pos in range(len(cks)):
        capped.append(min(cks[pos], tks[pos]))
    return capped

In [195]:
# For every user, sum the keyword-count vectors of the documents they clicked and
# compare the sum with the topic's own keyword vector (similarity = 1 - f_dist).
for user in dataset:
    u_id = user["userID"]
    ALG = user["ALG"]
    RPL = user["RPL"]
    topic = urllib.parse.quote(user["topic_title"])
    # Start from explicit numpy arrays so that `+=` below is unambiguously an
    # element-wise sum. With plain Python lists the same statement only adds
    # element-wise because the operator falls back to numpy's reflected add.
    cks_s = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}
    cks_s_norm = {i: np.zeros(len(keywords_yake[i][topic])) for i in range(1, max_n_grams + 1)}

    for click in user["clicks"]:
        url = click["url"]
        if url not in valid_urls:
            continue
        for i in range(1, max_n_grams + 1):
            cks_s[i] += docs_yake_embeddings[i][url]
            cks_s_norm[i] += docs_yake_embeddings_norm[i][url]
    # normalized vs un-normalized
    # NOTE(review): a user without any valid click keeps an all-zero vector, which
    # presumably explains the recorded divide warning and yields a nan similarity --
    # confirm how such users should be scored.
    for i in range(1, max_n_grams + 1):
        final_knowledges[f"{i}"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s[i], tks_s[i][topic]),
        }
        final_knowledges[f"{i}_norm"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s_norm[i], tks_s_norm[i][topic]),
        }

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [202]:
RPLs = []
ALGs = []
u_ids = [u["userID"] for u in dataset]

# Correlation of the estimated similarity with ALG, keyed by variant name.
pearsons_ALG = {}
spearman_ALG = {}
kendalltau_ALG = {}
corr_ALG = {}
# Correlation with RPL, stored separately so the ALG results do not overwrite it.
pearsons_RPL = {}
spearman_RPL = {}

for m in final_knowledges:
    u_ids = final_knowledges[m].keys()
    ALGs = [final_knowledges[m][u]["ALG"] for u in u_ids]
    RPLs = [final_knowledges[m][u]["RPL"] for u in u_ids]
    results = [final_knowledges[m][u]["final_sim"] for u in u_ids]

    pearsons_RPL[m] = pearsonr(results, RPLs)
    spearman_RPL[m] = spearmanr(results, RPLs)

    pearsons_ALG[m] = pearsonr(results, ALGs)
    spearman_ALG[m] = spearmanr(results, ALGs)

# Rank variants by the correlation coefficient (first element of the result),
# not by comparing whole (coefficient, p-value) results.
pprint(sorted(pearsons_ALG.items(), key=lambda item: item[1][0], reverse=True)[:3])

[('1_norm', (0.26858660054041344, 0.00226417748319268)),
 ('1', (0.2262090952587408, 0.010549422224016981)),
 ('3_norm', (0.2048923673746877, 0.0208460277504491))]
