In [47]:
import pandas as pd
import os
from sklearn import datasets
import numpy as np

In [48]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))

In [79]:
from sklearn.metrics.pairwise import cosine_distances

def score_model(model, data_vectorized):
    doc_topic_distr = model.transform(data_vectorized)
    distances = cosine_distances(doc_topic_distr)
    np.fill_diagonal(distances, np.inf)
    idx = np.argpartition(distances, 99)[:100, :] # indexes of the 100 closest docs 
    score = np.mean(newsgroups.target[idx] == newsgroups.target[None, :])
    return score

In [50]:
cats = ['rec.autos', 'rec.motorcycles',
        'sci.crypt', 'sci.electronics', 
        'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.religion.misc',
        'rec.sport.baseball', 'rec.sport.hockey']

newsgroups = datasets.fetch_20newsgroups(subset = "all", categories=cats)

In [51]:
documents = newsgroups.data

In [52]:
# https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/
import re
documents = [re.sub('\S*@\S*\s?', '', doc) for doc in documents] # Remove emails
documents = [re.sub('\s+', ' ', doc) for doc in documents] # Remove newlines
documents = [re.sub("\'", "", doc) for doc in documents] # Remove single-quotes

In [53]:
import gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(documents))

In [54]:
import spacy
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=10, lowercase=True, token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3)
                             stop_words='english')
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [83]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=len(cats), random_state=10, 
                                n_jobs=-1, learning_method='online',
                                batch_size=128, evaluate_every = -1,
                                doc_topic_prior=.1, topic_word_prior=.01).fit(data_vectorized)

In [84]:
score_model(lda, data_vectorized)

0.5526042876755729

In [85]:
print_top_words(lda, vectorizer.get_feature_names(), 10)

Topic 0:
year science space base program work make cost good launch
Topic 1:
gun state law right weapon people government use firearm control
Topic 2:
key use chip encryption clipper government public security information message
Topic 3:
space earth mission orbit organization subject line sun satellite post
Topic 4:
bike organization line subject write dod post article host nntp
Topic 5:
medical patient disease health use doctor cancer drug test service
Topic 6:
say god people write know subject article make christian good
Topic 7:
line subject organization car use write post article host good
Topic 8:
write article just line subject say think organization people know
Topic 9:
game team line play subject organization good win player year


In [59]:
from sklearn.model_selection import GridSearchCV

search_params = {'doc_topic_prior': [0.001, 0.1, 0.5, 1], 'topic_word_prior': [.0001, .001, .01, 1]}
tuned_model = GridSearchCV(LatentDirichletAllocation(n_components=len(cats), batch_size=128, evaluate_every=-1, 
                                                     learning_method='online', n_jobs=-1, random_state=10), 
                                                     param_grid=search_params)
tuned_model.fit(data_vectorized)
tuned_model.best_params_



{'doc_topic_prior': 1, 'topic_word_prior': 0.01}

In [60]:
score_model(tuned_model, data_vectorized)

0.4583028830921956