In [85]:
import json
import pickle
import re
import urllib
import urllib.parse

from collections import Counter, OrderedDict, defaultdict
from pprint import pprint

import numpy as np
import yake

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

# Extract keywords from Wikipedia text

In [25]:
# Number of keywords kept per topic, and the largest n-gram size an extractor is built for.
top_k = 10
max_n_grams = 3

# n-gram sizes iterated over by the cells below. The saved outputs of this notebook only
# contain the keys '3' and '3_ceil', so a single size of 3 is assumed — TODO confirm.
n_grams_to_consider = [3]

# kw_extractors[n] is the YAKE extractor created with n=n; index 0 is a None placeholder
# so the list can be indexed directly by n-gram size. Each extractor is asked for
# 2 * top_k candidates (presumably headroom for the de-duplication done after stemming).
kw_extractors = [None] + [
    yake.KeywordExtractor(n=size, top=top_k * 2) for size in range(1, max_n_grams + 1)
]

In [136]:
# English stop words from NLTK, kept as a set; clean_text() filters tokens with
# `w not in stop_words`.
stop_words = set(stopwords.words('english'))
    
def stem_text(text):
    """Return the Porter stem of every token in ``text``.

    ``text`` may be a list of tokens, which is stemmed element by element, or a
    string, which is first split on whitespace. The result is always a list.
    """
    tokens = text if isinstance(text, list) else text.split()
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))


def get_kw(text, max_ngram_size=2, return_scores=False):
    """Extract keywords from ``text`` with the extractor stored at ``kw_extractors[max_ngram_size]``.

    With ``return_scores=True`` the extractor's result is returned unchanged;
    otherwise only the first element of each result item (the keyword) is kept.
    """
    extracted = kw_extractors[max_ngram_size].extract_keywords(text)
    if return_scores:
        return extracted
    return [item[0] for item in extracted]


def get_all_n_grams(text, n_max=1):
    """List every n-gram of ``text`` for n = 1 .. ``n_max``, each joined with a single space.

    ``text`` is passed straight to ``nltk.ngrams``; n-grams are emitted size by
    size (all 1-grams first, then 2-grams, ...).
    """
    return [
        " ".join(gram)
        for size in range(1, n_max + 1)
        for gram in ngrams(text, n=size)
    ]


def clean_text_yake(text):
    """Lower-case ``text`` and strip punctuation, returning one cleaned string.

    Tabs, newlines and carriage returns are turned into spaces first; every
    character that is not a space, a word character or a literal ``+`` is then
    removed. No tokenisation, stop-word removal or stemming happens here.
    """
    lowered = text.lower()
    # Replace tabs/newlines with spaces *before* filtering: the filter below deletes
    # them outright, which would glue the neighbouring words together.
    spaced = re.sub(r"[\t\n\r]", " ", lowered)
    # Inside the character class '+' is a literal, so plus signs survive the filter.
    return re.sub(r"[^ \w+]", "", spaced)

def clean_text(text):
    """Turn ``text`` into a list of stemmed tokens with stop words removed.

    The text is lower-cased, hyphens become spaces, and every character that is
    not a space, a word character or a literal ``+`` is dropped. The result is
    tokenised with ``word_tokenize``; tokens found in the module-level
    ``stop_words`` are discarded and the rest are Porter-stemmed.
    """
    dehyphenated = text.lower().replace("-", " ")
    stripped = re.sub(r"[^ \w+]", "", dehyphenated)

    porter = PorterStemmer()
    kept_tokens = filter(lambda tok: tok not in stop_words, word_tokenize(stripped))
    return [porter.stem(tok) for tok in kept_tokens]


def wordcount(text):
    """Count the stemmed, non-stop-word tokens of ``text``.

    The text is lower-cased, hyphens become spaces and every character that is
    neither a word character nor a space is removed. The remainder is tokenised
    with ``word_tokenize``; English stop words are dropped and the surviving
    tokens are Porter-stemmed.

    Returns a ``Counter`` mapping each stem to its number of occurrences. Only
    real tokens are counted, so empty input yields an empty ``Counter``.
    """
    normalized = text.lower().replace("-", " ")
    # Raw string: "\w" in a plain string literal is an invalid escape sequence.
    normalized = re.sub(r"[^\w ]", "", normalized)

    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Count the stems directly; joining them into a string and splitting it again
    # would add a spurious '' entry for the leading separator.
    return Counter(
        stemmer.stem(token)
        for token in word_tokenize(normalized)
        if token not in english_stop_words
    )

In [97]:
# All three dicts are keyed as [n-gram size][topic].
keywords_yake = defaultdict(dict)  # de-duplicated, stemmed top-k keywords
tks_s = defaultdict(dict)          # keyword counts in the Wikipedia text, in keyword order
tks_s_norm = defaultdict(dict)     # the same counts after sklearn's normalize()

# True  -> count keywords with wordcount() (stop words removed).
# False -> count keywords among the stemmed unigrams of the YAKE-cleaned text.
remove_sw = True

# `n_grams_to_consider` must be defined before this cell runs.
with open("../data/wikipedia_texts.tsv") as wiki_file:
    for line in wiki_file:
        topic, text = line.strip().split("\t", maxsplit=1)

        # Both values depend only on the text, so compute them once per topic.
        yake_text = clean_text_yake(text)
        if remove_sw:
            c = wordcount(text)
        else:
            all_n_grams = get_all_n_grams(stem_text(yake_text))

        for i in n_grams_to_consider:
            kws = get_kw(yake_text, i)  # Get keywords with YAKE
            kws = stem_text(" ".join(kws))  # Stem keywords (splits multi-word keywords into words)
            kws = list(dict.fromkeys(kws))[:top_k]  # Remove duplicates, keep order, take the top-k
            assert len(kws) == top_k

            if not remove_sw:
                c = Counter([x for x in all_n_grams if x in kws])  # Count occurrences within the text
                assert len(set(kws).difference(c.keys())) == 0  # Every keyword must occur at least once

            counts = np.asarray([c[x] for x in kws])  # Vector follows the keyword order
            tks_s[i][topic] = counts
            tks_s_norm[i][topic] = normalize(counts.reshape(1, -1)).flatten()
            keywords_yake[i][topic] = kws

# Embeddings of clicked documents

In [116]:
# Clicked document whose stored embedding is inspected in the next cell.
url = "https://www.health.harvard.edu/newsletter_article/Noise-induced_hearing_loss"

In [124]:
# Display the keyword-count vector stored for `url`.
# NOTE: `docs_yake_embeddings` and `i` are only assigned by the embedding cell further
# down, so this lookup fails on a fresh kernel unless that cell has been run first.
docs_yake_embeddings[i][url]

array([7, 0, 0, 1, 4, 1, 0, 0, 1, 2])

In [168]:
# Reference embeddings the vectors computed below are compared against. Loaded here so
# this cell does not depend on a later cell having been executed.
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
with open("dima_emb.pkl", "rb") as emb_file:
    embeddings_dima = pickle.load(emb_file)

# Both dicts are keyed as [n-gram size][url] and hold one count per topic keyword.
docs_yake_embeddings = defaultdict(dict)       # counts over stemmed unigrams of the YAKE-cleaned text
docs_yake_embeddings_dima = defaultdict(dict)  # counts over clean_text() tokens (stop words removed)
i = 3  # n-gram size whose keyword lists are used

with open("../data/clicked_docs_with_topics.tsv") as docs_file:
    # total is hard-coded and only affects the progress bar.
    for line in tqdm(docs_file, total=1074):
        url, topic, text = line.strip().split("\t", maxsplit=2)
        topic_kws = keywords_yake[i][topic]

        all_n_grams = get_all_n_grams(stem_text(clean_text_yake(text)))
        c = Counter([x for x in all_n_grams if x in topic_kws])
        docs_yake_embeddings[i][url] = [c[x] for x in topic_kws]

        c = Counter([x for x in clean_text(text) if x in topic_kws])
        vect = [c[x] for x in topic_kws]
        docs_yake_embeddings_dima[i][url] = vect

        # Report every document whose vector differs from the reference embedding.
        if vect != embeddings_dima[url]:
            print(url)
            print(embeddings_dima[url])
            print(vect)
            print(docs_yake_embeddings[i][url])

  0%|          | 0/1074 [00:00<?, ?it/s]

http://www.moentcenter.com/library/4036/Noise-InducedHearingLossInChildren.html
[13, 1, 0, 0, 11, 4, 0, 0, 0, 6]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
https://www.amboss.com/us/knowledge/Hearing_loss
[68, 0, 3, 0, 52, 0, 2, 0, 0, 8]
[3, 0, 0, 0, 3, 0, 0, 0, 0, 0]
[3, 0, 0, 0, 3, 0, 0, 0, 0, 0]
https://www.health.harvard.edu/newsletter_article/Noise-induced_hearing_loss
[7, 0, 0, 1, 4, 1, 0, 0, 0, 2]
[7, 0, 0, 1, 4, 1, 0, 0, 1, 2]
[7, 0, 0, 1, 4, 1, 0, 0, 1, 2]
https://www.economywatch.com/us-subprime/impact-europe.html
[0, 9, 0, 0, 9, 4, 2, 0, 0, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
https://www.starkey.com/blog/articles/2016/12/tinnitus-tied-to-hearing-loss
[11, 1, 0, 0, 6, 0, 0, 0, 0, 3]
[12, 1, 0, 0, 6, 0, 0, 0, 0, 3]
[12, 1, 0, 0, 6, 0, 0, 0, 0, 3]
https://www.amazon.com/Austrian-School-Business-Cycle-Theory/dp/131222827X
[0, 0, 1, 3, 3, 0, 0, 0, 0, 0]
[0, 0, 1, 8, 7, 0, 0, 0, 0, 0]
[0, 0, 1, 8, 7, 0, 0, 0, 0, 0]
http://www.nationa

In [145]:
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("dima_emb.pkl", "rb") as emb_file:
    embeddings_dima = pickle.load(emb_file)

# Knowledge estimation

In [196]:
normalized = True
users_knowledge = []
# Read the interaction logs and close the file right away.
with open("../data/logs_with_position.json") as logs_file:
    dataset = json.load(logs_file)
# metric name -> user id -> result record, in insertion order.
final_knowledges = defaultdict(OrderedDict)
final_knowledges_norm = defaultdict(OrderedDict)
# Only clicks on these URLs are counted; relies on `i` left over from the embedding cell.
valid_urls = docs_yake_embeddings[i].keys()
# Distance function; the estimation loop reports 1 - f_dist(...) as similarity.
f_dist = cosine


def trim_upper_limit(cks, tks):
    """Cap each entry of ``cks`` at the entry of ``tks`` in the same position.

    Returns a NumPy array with one element per element of ``cks``, holding the
    smaller of the two values at that position.
    """
    capped = []
    for pos in range(len(cks)):
        capped.append(min(cks[pos], tks[pos]))
    return np.asarray(capped)

In [197]:
# NOTE: pickle.load can execute arbitrary code — only open files from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("dima_wiki_emb.pkl", "rb") as wiki_emb_file:
    embeddings_wiki_dima = pickle.load(wiki_emb_file)

In [198]:
# Per-document keyword-count vectors that are summed into each user's knowledge state.
# Alternatives built earlier: docs_yake_embeddings_dima[3] or docs_yake_embeddings[3].
doc_embeddings = embeddings_dima

# NOTE(review): `targets` is never read in this cell — the similarities below are
# computed against tks_s[i][topic]. Confirm which target vectors are intended.
targets = embeddings_wiki_dima

for u in tqdm(dataset):
    u_id = u["userID"]
    ALG = u["ALG"]
    RPL = u["RPL"]
    # Percent-encode the title (presumably so it matches the topic keys of
    # keywords_yake / tks_s — verify against the TSV files).
    topic = urllib.parse.quote(u["topic_title"])

    # Accumulated keyword counts per n-gram size: plain sum, and sum of per-document
    # vectors that were first capped at the topic's target counts.
    cks_s = {i: np.asarray([0] * len(keywords_yake[i][topic])) for i in n_grams_to_consider}
    cks_s_ceil = {i: np.asarray([0] * len(keywords_yake[i][topic])) for i in n_grams_to_consider}

    for d in u["clicks"]:
        url = d["url"]
        if url not in valid_urls:
            continue
        # doc_embeddings is not keyed by n-gram size, so the same vector is used for every i.
        doc_vector = np.asarray(doc_embeddings[url])
        for i in n_grams_to_consider:
            cks_s[i] += doc_vector
            cks_s_ceil[i] += trim_upper_limit(doc_vector, tks_s[i][topic])

    for i in n_grams_to_consider:
        target_vector = tks_s[i][topic]
        final_knowledges[f"{i}"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s[i], target_vector),
        }
        final_knowledges[f"{i}_ceil"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s_ceil[i], target_vector),
        }

  0%|          | 0/127 [00:00<?, ?it/s]

In [199]:
# Pearson correlation between each metric's similarity scores and the users' RPL values.
RPLs, ALGs = [], []
u_ids = [u["userID"] for u in dataset]

pearsons_RPL = {}
corr_ALG = {}

for m in final_knowledges:
    per_user = final_knowledges[m]
    u_ids = per_user.keys()
    # Values are iterated in the same order as the keys, so the lists stay aligned.
    ALGs = [record["ALG"] for record in per_user.values()]
    RPLs = [record["RPL"] for record in per_user.values()]
    results = [record["final_sim"] for record in per_user.values()]

    pearsons_RPL[m] = pearsonr(results, RPLs)

# Highest correlation first.
ranking = sorted(pearsons_RPL.items(), key=lambda item: item[1], reverse=True)
pprint(ranking)

[('3', (0.311954498627866, 0.00035655478368730453)),
 ('3_ceil', (0.22128421222987416, 0.012413527912592178))]


In [200]:
# Store the results as a plain dict. The context manager guarantees the file is flushed
# and closed once the dump has finished.
with open("../data/Arthur_KW_Knowledges.pkl", "wb") as out_file:
    pickle.dump(dict(final_knowledges), out_file)