In [1]:
from data_load import data, target
from lsa import *
from metrics import mean_precision_at_k

# Baseline processing

In [2]:
tokenized_corpus = tokenizer(data)

In [3]:
cleaned_corpus = clean_corpus(tokenized_corpus)

In [4]:
finalized_corpus, vocab = create_dtm(cleaned_corpus)

In [5]:
vectorized_dtm = tf_idf_transformation(finalized_corpus)

In [6]:
decomposed = lsa(vectorized_dtm)

In [7]:
similar_docs = most_similar(decomposed, target)

In [8]:
print("Mean precision@10: {:.3f}".format(mean_precision_at_k(target, similar_docs)))

Mean precision@10: 0.628


# Num components and singular values tuning

In [9]:
components = [10, 100, 1000, 300, 700]

for n_comp in components:
    decomposed_vectors = lsa(vectorized_dtm, components=n_comp)
    similar_docs = most_similar(decomposed_vectors, target)
    print("Mean precision@10 with {} components : {:.3f}".format(n_comp, mean_precision_at_k(target, similar_docs)))

Mean precision@10 with 10 components : 0.675
Mean precision@10 with 100 components : 0.619
Mean precision@10 with 1000 components : 0.623
Mean precision@10 with 300 components : 0.628
Mean precision@10 with 700 components : 0.582


In [10]:
for singulars in [True, False]:
    decomposed_vectors = lsa(vectorized_dtm, components=100, use_singulars=singulars)
    similar_docs = most_similar(decomposed_vectors, target)
    print("Mean precision@10: {:.3f} if use singulars is {}".format(mean_precision_at_k(target, similar_docs),
                                                                    singulars)) 

Mean precision@10: 0.619 if use singulars is True
Mean precision@10: 0.468 if use singulars is False


# Comparison with gensim lda model

In [11]:
from gensim import models, corpora, matutils, similarities

In [12]:
vocab_gensim = corpora.Dictionary(cleaned_corpus)
bow_corpus = [vocab_gensim.doc2bow(text) for text in cleaned_corpus]

In [13]:
tfidf_model = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf_model[bow_corpus]

In [15]:
TOTAL_TOPICS = 20

lda_model_tfidf = models.LdaModel(corpus=tfidf_corpus, 
                                  id2word=vocab_gensim, 
                                  num_topics=TOTAL_TOPICS, 
                                  random_state=239)

In [16]:
sims = similarities.MatrixSimilarity(lda_model_tfidf[tfidf_corpus])

In [17]:
vecs = lda_model_tfidf[tfidf_corpus]

In [18]:
sims_matrix = sims[vecs]
np.fill_diagonal(sims_matrix, -1)
target_indices = np.argpartition(-sims_matrix, 10, axis=0)[:, :10]
most_similar_docs = target[target_indices]

In [19]:
print("Mean precision@10: {:.3f}".format(mean_precision_at_k(target, most_similar_docs)))

Mean precision@10: 0.378
