## Рекомендация статей с помощью тематических моделей

In [1]:
from scipy.special import softmax
from collections import OrderedDict
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from collections import Counter
from sklearn import metrics
import numpy as np
import tqdm

In [None]:
# Parse the corpus file line by line: the first whitespace-separated token of a
# line is used as the paper id, the remaining tokens (with English stop words
# removed) are kept as that paper's text. Insertion order follows the file.
stop_words = set(stopwords.words('english'))
arxiv_tokens = OrderedDict()
with open('data/arxiv_plain.txt', 'r') as corpus_file:
    for raw_line in tqdm.tqdm(corpus_file):
        fields = raw_line.split()
        arxiv_tokens[fields[0]] = [word for word in fields[1:] if word not in stop_words]
arxiv_titles = list(arxiv_tokens)

In [None]:
# Build the vocabulary over all papers, then encode every paper as a
# bag-of-words document using that vocabulary (same order as arxiv_tokens).
arxiv_dictionary = Dictionary(arxiv_tokens.values())
arxiv_corpus = list(map(arxiv_dictionary.doc2bow, arxiv_tokens.values()))

In [None]:
# Train a 30-topic LDA model on the bag-of-words corpus.
# - id2word attaches the vocabulary, so topics can be inspected as words
#   instead of integer token ids.
# - random_state pins the random initialisation, so that Restart & Run All
#   reproduces the same topics and the same evaluation numbers.
lda = LdaModel(arxiv_corpus, id2word=arxiv_dictionary, num_topics=30, random_state=42)

In [None]:
# Sanity check: the number of topics the model was built with.
lda.num_topics

In [None]:
# Topic mixture inferred for the first paper of the corpus,
# as a list of (topic id, probability) pairs.
lda.get_document_topics(arxiv_corpus[0])

In [None]:
# Dense document-topic table: {paper id: vector of length lda.num_topics}.
# Topics that the model does not report for a paper keep the value 0.
# NOTE(review): presumably gensim omits topics below its minimum_probability
# threshold, so the vectors may not sum to 1 -- confirm if that matters.
theta = {}
for paper_id, bow in tqdm.tqdm(zip(arxiv_titles, arxiv_corpus)):
    dense_vector = np.zeros(lda.num_topics)
    reported_topics = lda.get_document_topics(bow)
    if reported_topics:
        topic_ids, topic_probs = zip(*reported_topics)
        dense_vector[list(topic_ids)] = topic_probs
    theta[paper_id] = dense_vector

Тематический вектор статьи с номером 0704.0004:

In [None]:
# Peek at the dense topic vector stored for a single paper.
theta['0704.0004']

Теперь для того, чтобы порекомендовать читателю близкие по смыслу статьи, достаточно выбрать метрику близости и сравнить вектор текущего документа (например, последнего прочитанного) с векторами всех остальных документов в коллекции. В качестве метрики близости можно использовать косинусную меру, евклидово расстояние, расстояние Хеллингера и т.д.

In [2]:
def cos_sim(first, second):
    """Cosine similarity between two vectors, returned as a single number.

    Both vectors are reshaped into one-row matrices because the scikit-learn
    pairwise function is called on 2-D inputs; the only entry of the
    resulting 1x1 matrix is returned.
    """
    row_first = first.reshape(1, -1)
    row_second = second.reshape(1, -1)
    similarity_matrix = metrics.pairwise.cosine_similarity(row_first, row_second)
    return similarity_matrix[0][0]

def dot_sim(first, second):
    """Unnormalised similarity: the dot product of ``first`` with ``second``."""
    product = first.dot(second)
    return product

def hel_sim(first, second):
    """Negated sum of squared differences between element-wise square roots.

    A Hellinger-style score: compared with the textbook Hellinger distance,
    the final square root and the division by sqrt(2) are omitted, and a
    minus sign is added so that a larger value means "more similar"
    (identical inputs score 0, everything else is negative).
    """
    root_gap = np.sqrt(first) - np.sqrt(second)
    return -np.sum(root_gap ** 2)

def jaccard_sim(first, second):
    """Jaccard similarity between the sets of distinct elements of two iterables.

    Args:
        first: iterable of hashable items.
        second: iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1], where A and B are the sets
        of distinct items of ``first`` and ``second``. When both inputs are
        empty the union is empty as well; the two (identical) empty sets are
        then given similarity 1.0 instead of raising ZeroDivisionError.
    """
    first_set, second_set = set(first), set(second)
    union = first_set | second_set
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    return len(first_set & second_set) / len(union)

In [None]:
def recommend_papers(query, theta, sim=cos_sim, top_k=10):
    """Rank the papers in ``theta`` by similarity to the paper ``query``.

    Args:
        query: key of the reference paper in ``theta``.
        theta: mapping from paper id to that paper's vector.
        sim: callable ``sim(query_vec, doc_vec)`` returning a score for which
            a larger value means "more similar".
        top_k: how many of the best-scoring entries to return.

    Returns:
        A list of at most ``top_k`` ``(paper id, score)`` tuples ordered from
        the highest score to the lowest. The query paper is scored like every
        other entry of ``theta``; it is not filtered out.
    """
    reference = theta[query]
    scored = [(paper_id, sim(reference, vector)) for paper_id, vector in theta.items()]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_k]

In [None]:
# Five best-scoring papers for one example query (default similarity: cosine).
recommended_papers = recommend_papers('0704.2596', theta, top_k=5)

In [None]:
# For every recommended paper print its id, then its ten most frequent tokens
# on one line, then an empty separator line.
for paper_id, _score in recommended_papers:
    frequent_tokens = Counter(arxiv_tokens[paper_id]).most_common(10)
    print(paper_id)
    print(' '.join(word for word, _count in frequent_tokens))
    print()

Для оценки качества полученной рекомендательной системы воспользуемся датасетом триплетов [[Dai et al. 2015](https://arxiv.org/abs/1507.07998)]. Датасет содержит тройки статей `<запрос>|<релевантная статья>|<нерелевантная статья>`. Будем считать, что если метрика близости между запросом и релевантной статьей оказалась выше, чем между запросом и нерелевантной статьей, то такая тройка обработана "правильно".

In [3]:
def evaluate_quality(theta, sim, triplets_path='data/arxiv_triplets.txt'):
    """Fraction of triplets in which the relevant paper outscores the irrelevant one.

    Every line of the triplets file is split on whitespace into
    ``<query> <relevant> <irrelevant>``; for each field only the part after
    the last ``/pdf/`` is kept, so full URLs are reduced to bare paper ids.

    Args:
        theta: mapping from paper id to that paper's vector.
        sim: callable ``sim(u, v)`` returning a score for which a larger value
            means "more similar".
        triplets_path: location of the triplets file. Defaults to the path the
            notebook has always used.

    Returns:
        ``correct / covered`` as a float, where a triplet is *covered* when
        all of its ids are present in ``theta`` and *correct* when
        ``sim(query, relevant) > sim(query, irrelevant)``. ``nan`` is returned
        when no triplet is covered, because the accuracy is undefined then.
    """
    covered_triplets = 0
    correct_triplets = 0
    with open(triplets_path, 'r') as fin:
        for line in fin:
            ids = [field.split('/pdf/')[-1] for field in line.split()]
            # Blank or truncated lines cannot form a triplet; without this
            # guard they would be counted as covered and then fail on indexing.
            if len(ids) < 3:
                continue
            if not all(paper_id in theta for paper_id in ids):
                continue
            covered_triplets += 1
            query_vec, relevant_vec, irrelevant_vec = (theta[paper_id] for paper_id in ids[:3])
            if sim(query_vec, relevant_vec) > sim(query_vec, irrelevant_vec):
                correct_triplets += 1
    if covered_triplets == 0:
        # Nothing to evaluate (e.g. ids do not match theta): avoid dividing by zero.
        return float('nan')
    return correct_triplets / covered_triplets

In [None]:
# Share of covered triplets ranked correctly when papers are compared with cosine similarity.
evaluate_quality(theta, cos_sim)

In [None]:
# Same evaluation with the Hellinger-style score.
evaluate_quality(theta, hel_sim)

In [None]:
# Same evaluation with the plain dot product.
evaluate_quality(theta, dot_sim)

Попробуем 300 тем

In [None]:
# Repeat the experiment with 300 topics: retrain, rebuild theta, re-evaluate.
# id2word attaches the vocabulary so topics can be read as words, and
# random_state pins the initialisation so the reported score is reproducible.
lda = LdaModel(arxiv_corpus, id2word=arxiv_dictionary, num_topics=300, random_state=42)

# Dense document-topic table: topics the model does not report stay at 0.
theta = {}
# zip() has no length, so pass the total explicitly to get a real progress bar.
for doc_title, doc_bow in tqdm.tqdm(zip(arxiv_titles, arxiv_corpus), total=len(arxiv_titles)):
    topic_vector = np.zeros(lda.num_topics)
    for topic_num, topic_prob in lda.get_document_topics(doc_bow):
        topic_vector[topic_num] = topic_prob
    theta[doc_title] = topic_vector

print(evaluate_quality(theta, cos_sim))

### Эксперимент №2: использовать BERT-based фичи совместно с тематическими фичами

In [4]:
from transformers import BertTokenizer, BertModel, BertTokenizerFast
from gensim.matutils import Sparse2Corpus
from scipy import sparse
import torch
import pickle 

BERT: http://jalammar.github.io/illustrated-bert/

In [5]:
# Rebuild every paper as one space-joined string of its non-stop-word tokens
# (the leading id token of each line is dropped). These strings are the
# texts passed to the BERT tokenizer further down.
stop_words = set(stopwords.words('english'))
articles = []
with open('data/arxiv_plain.txt', 'r') as corpus_file:
    for raw_line in tqdm.tqdm(corpus_file):
        words = raw_line.split()[1:]
        articles.append(' '.join(word for word in words if word not in stop_words))

43091it [01:40, 428.90it/s] 


In [6]:
# Alternative way to obtain `articles`, kept for reference:
#articles = [' '.join(tokens) for tokens in arxiv_tokens.values()]

# The tokenizer and the encoder are loaded from the same pre-trained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [7]:
# Encode every article as one vector: the mean of BERT's last-layer hidden
# states over the article's tokens. Each entry has shape (1, hidden_size).
bert_reprs = []
for article in tqdm.tqdm(articles):
    # One text per call, so no padding is needed. Padding to max_length would
    # (a) add the hidden states of [PAD] positions to the mean below and
    # (b) force every forward pass to the full 512 positions, even for short texts.
    inputs = tokenizer(article, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: do not record an autograd graph for every article.
    with torch.no_grad():
        outputs = model(**inputs)
    bert_representation = torch.mean(outputs.last_hidden_state, dim=1).numpy()
    bert_reprs.append(bert_representation)

100%|██████████| 43091/43091 [19:30:33<00:00,  1.63s/it]   


In [8]:
# Cache the embeddings on disk so the long encoding loop (about 19.5 hours
# according to the recorded progress bar) does not have to be repeated.
with open('bert_repr.pickle', 'wb') as cache_file:
    pickle.dump(bert_reprs, cache_file)

In [None]:
# Shape check of the stacked embeddings; the next cell indexes them as [:, 0, :],
# i.e. presumably (number of articles, 1, hidden size) -- confirm on the output.
np.array(bert_reprs).shape

In [None]:
# Stack the per-article embeddings into one (n_articles, hidden_size) matrix.
bert_features = np.array(bert_reprs)[:, 0, :]
# LDA treats the corpus values as non-negative term weights, while raw BERT
# activations are signed. Map every article's feature vector to a probability
# vector with softmax (imported at the top of the notebook) before fitting.
# NOTE(review): softmax is one reasonable choice of non-negative transform;
# confirm that it matches the intended experiment design.
bert_weights = softmax(bert_features, axis=1)
# Sparse2Corpus reads columns as documents by default, hence the transpose.
bert_sparse = sparse.csr_matrix(bert_weights.T)
corpus = Sparse2Corpus(bert_sparse)
# random_state pins the initialisation so the experiment is reproducible.
lda = LdaModel(corpus, num_topics=300, chunksize=50000, minimum_probability=0.0, random_state=42)

In [None]:
# Paper ids in file order: the first whitespace-separated token of every line.
with open('data/arxiv_plain.txt', 'r') as corpus_file:
    arxiv_titles = [raw_line.split()[0] for raw_line in tqdm.tqdm(corpus_file)]

In [None]:
# Number of documents in the BERT-feature corpus.
# NOTE(review): should equal len(arxiv_titles), since the two are zipped together later -- confirm.
len(corpus)

In [None]:
# Dense document-topic table for the LDA model fitted on BERT features:
# {paper id: vector of length lda.num_topics}; unreported topics stay at 0.
theta = {}
for paper_id, feature_doc in tqdm.tqdm(zip(arxiv_titles, corpus)):
    dense_vector = np.zeros(lda.num_topics)
    reported_topics = lda.get_document_topics(feature_doc)
    if reported_topics:
        topic_ids, topic_probs = zip(*reported_topics)
        dense_vector[list(topic_ids)] = topic_probs
    theta[paper_id] = dense_vector