In [1]:
import numpy as np
def average_precision_at_k(theta, labels, k, dist_fn):
    dist = dist_fn(theta)
    np.fill_diagonal(dist, np.inf)
    idx = np.argpartition(dist, k - 1)[:k, :]
    return np.mean(labels[idx] == labels[None, :])

In [2]:
a = np.random.rand(10)
b = a[np.argpartition(a, 3)]
np.around(b,2)

array([0.09, 0.01, 0.15, 0.17, 0.23, 0.43, 0.46, 0.6 , 0.65, 0.81])

In [3]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

def hellinger(theta):
    return euclidean_distances(np.sqrt(theta))

def dot(theta):
    return 1 - np.dot(theta, theta.transpose())

In [25]:
from numba import jit, prange, autojit

In [31]:
@jit(nopython = True, parallel = True)
def outer(X, fn):
    dist = np.empty((X.shape[0], X.shape[0]), dtype = np.float32)
    for i in prange(X.shape[0]):
        for j in prange(i):
            dist[j,i] = dist[i,j] = fn(X[i,:], X[j,:])           
    return dist

In [29]:
a = np.random.rand(1000, 32).astype(np.float32)

In [7]:
%%timeit
dist = np.empty((a.shape[0], a.shape[0]), dtype = np.float32)
for i in range(a.shape[0]):
    for j in range(i):
        dist[i,j] = dist[j,i] = np.dot(a[i,:], a[j,:])

494 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
@jit(nopython = True)
def _dot(x,y):
    return np.dot(x,y)

In [30]:
%%timeit
dist = outer(a, _dot)

20.9 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%%timeit
np.dot(a, a.transpose())

1.48 ms ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
@jit(nopython = True)
def kl(p, q):
    return np.dot(p, np.log(q / p))

In [12]:
@jit(nopython = True)
def sym_kl(p, q):
    return kl(p, q) + kl(q, p)

In [13]:
@jit(nopython = True)
def js(p, q):
    m = (p + q) / 2
    return kl(p, m) / 2 + kl(q, m) / 2

In [14]:
@jit(nopython = True)
def overlap(p, q):
    return 1 - np.sum(np.minimum(p,q))

In [15]:
@jit(nopython = True)
def l1_norm(p, q):
    return np.sum(np.abs(p - q))

In [16]:
@jit(nopython = True)
def bc(p, q):
    return -np.log(np.sum(np.sqrt(p * q)))

In [17]:
dist_fns = {'cosine' : cosine_distances,
            'eucl' : euclidean_distances,
            'hellinger' : hellinger,
            'dot' : dot,
            'sym_kl' : lambda x: outer(x, sym_kl),
            'js' : lambda x: outer(x, js),
            'overlap': lambda x: outer(x, overlap),
            'l1' : lambda x: outer(x, l1_norm),
            'bc' : lambda x: outer(x, bc)}

In [18]:
def eval_topic_model(model, labels, k = 100, dist_fns = dist_fns):
    theta = model.get_theta()
    theta = np.asarray(theta, dtype = np.float32)
    theta = theta.transpose()
    best_score = 0
    best_dist = None
    for (dist, f) in dist_fns.items():
        score = average_precision_at_k(theta, labels, 100, f)
        if score > best_score:
            best_score, best_dist = score, dist
    return best_score, best_dist

In [19]:
from sklearn.datasets import fetch_20newsgroups
cats = ['rec.autos', 'rec.motorcycles',
        'sci.crypt', 'sci.electronics', 
        'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.religion.misc',
        'rec.sport.baseball', 'rec.sport.hockey']
newsgroups = fetch_20newsgroups(subset = 'all', categories=cats)

In [20]:
import spacy
model = spacy.load('en', disable = ['parser','ner'])

def tokenize(string):
    return [w.lemma_ for w in model(string) if w.pos_ not in {'SPACE','PUNCT'}]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 10, max_df = 0.3, tokenizer = tokenize, lowercase = False)
tdm = vectorizer.fit_transform(newsgroups['data']).transpose()
labels = newsgroups['target']

In [22]:
import artm
bv = artm.BatchVectorizer(data_format = 'bow_n_wd', n_wd = tdm, vocabulary = vectorizer.get_feature_names())

In [23]:
seed = 123
for num_topics in range(5, 60 + 1,5):
    model = artm.ARTM(num_topics = num_topics, dictionary = bv.dictionary, cache_theta = True, seed = seed)
    model.fit_offline(bv, num_collection_passes = 100)
    score, dist = eval_topic_model(model, labels)
    print("%2i topics AP@100 = %.3f (%s distance)" % (num_topics, score, dist))

 5 topics AP@100 = 0.491 (js distance)
10 topics AP@100 = 0.594 (js distance)
15 topics AP@100 = 0.521 (js distance)
20 topics AP@100 = 0.471 (sym_kl distance)
25 topics AP@100 = 0.539 (js distance)
30 topics AP@100 = 0.558 (js distance)
35 topics AP@100 = 0.563 (js distance)
40 topics AP@100 = 0.617 (js distance)
45 topics AP@100 = 0.505 (js distance)
50 topics AP@100 = 0.513 (js distance)
55 topics AP@100 = 0.475 (js distance)
60 topics AP@100 = 0.438 (bc distance)


In [24]:
for num_topics in range(5, 60 + 1,5):
    model = artm.LDA(num_topics = num_topics, dictionary = bv.dictionary, cache_theta = True, seed = seed)
    model.fit_offline(bv, num_collection_passes = 100)
    score, dist = eval_topic_model(model, labels)
    print("%2i topics AP@100 = %.3f (%s distance)" % (num_topics, score, dist))

 5 topics AP@100 = 0.532 (js distance)
10 topics AP@100 = 0.592 (js distance)
15 topics AP@100 = 0.540 (sym_kl distance)
20 topics AP@100 = 0.545 (sym_kl distance)
25 topics AP@100 = 0.609 (js distance)
30 topics AP@100 = 0.583 (js distance)
35 topics AP@100 = 0.507 (js distance)
40 topics AP@100 = 0.575 (sym_kl distance)
45 topics AP@100 = 0.602 (sym_kl distance)
50 topics AP@100 = 0.668 (sym_kl distance)
55 topics AP@100 = 0.638 (sym_kl distance)
60 topics AP@100 = 0.561 (sym_kl distance)
