In [32]:
import json
import pickle
import re
import urllib

from collections import Counter, OrderedDict, defaultdict
from pprint import pprint

import numpy as np
import yake

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn import preprocessing as skp
from tqdm.auto import tqdm
from utils import rawcount

# Extract keywords from Wikipedia text

In [33]:
top_k = 10
max_n_grams = 3

kw_extractor = yake.KeywordExtractor(n=max_n_grams, top=top_k * 2)

In [34]:
stop_words = set(stopwords.words("english"))

def normalize(v):
    return skp.normalize(v.reshape(1, -1)).flatten()
    

def stem_text(text):
    stemmer = PorterStemmer()
    if isinstance(text, list):
        return [stemmer.stem(x) for x in text]
    return [stemmer.stem(x) for x in text.split()]


def get_kw(text, return_scores=False):
    if return_scores:
        return kw_extractor.extract_keywords(text)

    return [x[0] for x in kw_extractor.extract_keywords(text)]


def get_all_n_grams(text, n_max=1):
    all_grams = []
    for n in range(1, n_max + 1):
        all_grams += [" ".join(x) for x in ngrams(text, n=n)]
    return all_grams


def clean_text_yake(text):
    c_text = text.lower().replace("\t\n", " ")
    c_text = re.sub(r"[^ \w+]", "", c_text)
    tokens = word_tokenize(c_text)
    return c_text


def clean_text(text):
    stemmer = PorterStemmer()
    c_text = text.lower().replace("-", " ")
    # c_text = text.lower().replace("\t\n", " ")  # lowecase, remove tabs and new lines
    c_text = re.sub(r"[^ \w+]", "", c_text)  # remove punctuations and non-alpha

    tokens = word_tokenize(c_text)
    filtered_sentence = [stemmer.stem(w) for w in tokens if w not in stop_words]
    return filtered_sentence


def wordcount(text):
    text = text.lower()
    text = text.replace("-", " ")
    text = re.sub("[^\w ]", "", text)
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    filtered_sentence = ""
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence = filtered_sentence + " " + stemmer.stem(w)
    words = filtered_sentence.split(" ")

    return Counter(words)

In [35]:
keywords_yake = {}
tks_s = {}
tks_s_norm = {}

tks_s_no_sw = {}
tks_s_no_sw_norm = {}

remove_sw = False  # If True, Dima mode. If false, my

for l in open("../data/wikipedia_texts.tsv"):
    topic, text = l.strip().split("\t", maxsplit=1)

    kws = get_kw(clean_text_yake(text))  # Get keywords with YAKE
    kws = stem_text(" ".join(kws))  # Stem keywords
    kws = list(dict.fromkeys(kws))[:10]  # Remove duplicates, get only the top-10
    assert len(set(kws)) == 10

    # c_no_sw = {k:v for k,v in wordcount(text).items() if k in kws}
    # assert len((set(kws)).difference(set(c_no_sw.keys()))) == 0  # Make sure there are no 0s here
    # tks_s_no_sw[topic] = np.asarray([c_no_sw[x] for x in kws])  # Ensure order of "embedding" and save in dict
    # tks_s_no_sw_norm[topic] = normalize(tks_s_no_sw[topic])
    
    all_n_grams = get_all_n_grams(stem_text(clean_text_yake(text)))
    c = Counter([x for x in all_n_grams if x in kws])  # Count occurences within the text
    assert len((set(kws)).difference(set(c.keys()))) == 0  # Make sure there are no 0s here
    
    tks_s[topic] = np.asarray([c[x] for x in kws])  # Ensure order of "embedding" and save in dict
    tks_s_norm[topic] = normalize(tks_s[topic])  # Normalize
    
    
    keywords_yake[topic] = kws  # store kws in a dict

# Embeddings of clicked documents

In [54]:
docs_yake_embeddings = defaultdict(lambda: {})
# docs_yake_embeddings_dima = defaultdict(lambda: {})
docs_yake_embeddings_norm = defaultdict(lambda: {})

docs_file = "../data/clicked_docs.tsv"
doc_topics = pickle.load(open("../data/doc_topics.pkl", 'rb'))

for idx, line in tqdm(enumerate(open(docs_file)), total=rawcount(docs_file)):
    
    url, text = line.strip().split("\t", maxsplit=1)
    topic = doc_topics[url]
    all_n_grams = get_all_n_grams(stem_text(clean_text_yake(text)))
    c = Counter([x for x in all_n_grams if x in keywords_yake[topic]])
    docs_yake_embeddings[url] = np.asarray([c[x] for x in keywords_yake[topic]])
    docs_yake_embeddings_norm[url] = normalize(docs_yake_embeddings[url])

  0%|          | 0/1074 [00:00<?, ?it/s]

# Knowledge estimation

In [57]:
normalized = True
users_knowledge = []
dataset = json.load(open("../data/logs_with_position.json"))
final_knowledges = defaultdict(OrderedDict)
final_knowledges_norm = defaultdict(OrderedDict)
valid_urls = docs_yake_embeddings.keys()
f_dist = cosine

def trim_upper_limit(cks, tks):
    return np.asarray([min(cks[i], tks[i]) for i in range(len(cks))])
embeddings_wiki_dima = pickle.load(open("dima_wiki_emb.pkl", "rb"))

In [60]:
# Simpler implementation, sum all around only.

missing_docs = set()

targets = tks_s

embs = docs_yake_embeddings
wiki_embs = tks_s
final_knowledges = OrderedDict()

for u in tqdm(dataset):
    u_id = u["userID"]
    ALG = u["ALG"]
    RPL = u["RPL"]
    user_knowledge = np.zeros(10)  # initialize knowledge as zeros.
    topic = urllib.parse.quote(u["topic_title"])

    for d in u["clicks"]:
        url = d["url"]
        if url not in embs:
            missing_docs.add(url)
            continue
        user_knowledge += embs[url]
        

    final_knowledges[u_id] = {"RPL": RPL, "ALG": ALG, "estimated": 1 - f_dist(user_knowledge, targets[topic])}

  0%|          | 0/127 [00:00<?, ?it/s]

In [62]:
u_ids = final_knowledges.keys()
RPLs = [final_knowledges[u]["RPL"] for u in u_ids]
results = [final_knowledges[u]["estimated"] for u in u_ids]
pearsons = pearsonr(results, RPLs)
print(pearsons)
print(f"missing docs: {len(missing_docs)}")

pickle.dump(dict(final_knowledges), open(f"../data/KW_knowledge_gains.pkl", "wb"))

(0.3085872421042087, 0.0004159256441952825)
missing docs: 33


In [198]:
# doc_embeddings = docs_yake_embeddings_dima[3]
doc_embeddings = docs_yake_embeddings
# doc_embeddings = embeddings_dima


targets = tks_s
# targets = embeddings_wiki_dima

for u in tqdm(dataset):
    u_id = u["userID"]
    ALG = u["ALG"]
    RPL = u["RPL"]
    topic = urllib.parse.quote(u["topic_title"])
    cks_s = {i: np.asarray([0] * len(keywords_yake[i][topic])) for i in n_grams_to_consider}
    # cks_s_norm = {i: [0] * len(keywords_yake[i][topic]) for i in n_grams_to_consider}

    cks_s_ceil = {i: np.asarray([0] * len(keywords_yake[i][topic])) for i in n_grams_to_consider}
    # cks_s_ceil_norm = {i: [0] * len(keywords_yake[i][topic]) for i in n_grams_to_consider}

    for d in u["clicks"]:
        url = d["url"]
        if url not in valid_urls:
            continue
        for i in n_grams_to_consider:

            cks_s[i] += np.asarray(doc_embeddings[url])
            cks_s_ceil[i] += np.asarray(trim_upper_limit(doc_embeddings[url], tks_s[i][topic]))

            # cks_s_norm[i] += docs_yake_embeddings_norm[i][url]
            # cks_s_ceil_norm[i] += trim_upper_limit(docs_yake_embeddings_norm[i][url], tks_s_norm[i][topic])

    for i in n_grams_to_consider:
        final_knowledges[f"{i}"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s[i], tks_s[i][topic]),
        }
        # final_knowledges[f"{i}_norm"][u_id] = {
        #     "RPL": RPL,
        #     "ALG": ALG,
        #     "final_sim": 1 - f_dist(cks_s_norm[i], tks_s_norm[i][topic]),
        # }
        final_knowledges[f"{i}_ceil"][u_id] = {
            "RPL": RPL,
            "ALG": ALG,
            "final_sim": 1 - f_dist(cks_s_ceil[i], tks_s[i][topic]),
        }
        # final_knowledges[f"{i}_ceil_norm"][u_id] = {
        #     "RPL": RPL,
        #     "ALG": ALG,
        #     "final_sim": 1 - f_dist(cks_s_ceil_norm[i], tks_s_norm[i][topic]),
        # }

  0%|          | 0/127 [00:00<?, ?it/s]

In [199]:
RPLs = []
ALGs = []
u_ids = [u["userID"] for u in dataset]

pearsons_RPL = {}
corr_ALG = {}

for m in final_knowledges:
    u_ids = final_knowledges[m].keys()
    ALGs = [final_knowledges[m][u]["ALG"] for u in u_ids]
    RPLs = [final_knowledges[m][u]["RPL"] for u in u_ids]
    results = [final_knowledges[m][u]["final_sim"] for u in u_ids]

    # results = [_users[x]["final_sim"] for x in u_ids if x in _users]
    pearsons_RPL[m] = pearsonr(results, RPLs)
    # pearsons_ALG[m] = pearsonr(results, ALGs)

pprint(sorted(pearsons_RPL.items(), key=lambda x: x[1], reverse=True))

[('3', (0.311954498627866, 0.00035655478368730453)),
 ('3_ceil', (0.22128421222987416, 0.012413527912592178))]


In [200]:
pickle.dump(dict(final_knowledges), open(f"../data/Arthur_KW_Knowledges.pkl", "wb"))