In [1]:
import pandas as pd
import os
from sklearn import datasets
import numpy as np

In [2]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))

In [12]:
from sklearn.metrics.pairwise import cosine_distances

def score_model(model, data_vectorized):
    doc_topic_distr = model.transform(data_vectorized)
    distances = cosine_distances(doc_topic_distr)
    np.fill_diagonal(distances, np.inf)
    idx = np.argpartition(distances, 99)[:100, :] # indexes of the 100 closest docs 
    score = np.mean(newsgroups.target[idx] == newsgroups.target[None, :])
    return score

In [4]:
cats = ['sci.med', 'sci.space',
        'talk.politics.guns', 'talk.religion.misc',
        'rec.sport.baseball', 'rec.sport.hockey']

newsgroups = datasets.fetch_20newsgroups(subset = "all", categories=cats)

In [5]:
documents = newsgroups.data

In [6]:
import re
documents = [re.sub('\S*@\S*\s?', '', doc) for doc in documents] # Remove emails
documents = [re.sub('\s+', ' ', doc) for doc in documents] # Remove newlines
documents = [re.sub("\'", "", doc) for doc in documents] # Remove single-quotes

In [7]:
import gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(documents))

In [8]:
import spacy
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=10, lowercase=True, token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3)
                             stop_words='english')
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [10]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=len(cats), random_state=10, 
                                n_jobs=-1, learning_method='online',
                                batch_size=128,
                                doc_topic_prior=0.4, topic_word_prior=0.001).fit(data_vectorized)

In [13]:
score_model(lda, data_vectorized)

0.5503848946986202

In [14]:
print_top_words(lda, vectorizer.get_feature_names(), 10)

Topic 0:
line subject organization write post article good game year host
Topic 1:
use medical subject disease organization study patient line year food
Topic 2:
space nasa orbit launch earth mission satellite line moon organization
Topic 3:
write say subject god people line make article organization just
Topic 4:
team game play hockey win goal season player playoff nhl
Topic 5:
gun people write line right say subject article organization think
