# Visual Pipelines for Text Analysis 

June 25, 2017 &middot; Data Intelligence Conference &middot; Capital One

## Visual Topic Modeling 

In [1]:
# Standard-library and third-party packages imported for the notebook.
import os 
import nltk 
import sklearn 

# Plain Yellowbrick import, disabled in favor of the development checkout
# that is put on sys.path below.
#import yellowbrick as yb
import matplotlib.pyplot as plt

# Use development version of Yellowbrick 
# NOTE(review): the path appended here is an absolute, machine-specific
# location. Because it is appended (not inserted first), any Yellowbrick
# found earlier on sys.path still takes precedence - confirm that the
# development checkout is really the one being imported.
import sys 
sys.path.append("/Users/benjamin/Repos/ddl/yellowbrick")
import yellowbrick as yb

# Notebook specific utilities 
# NOTE(review): the star import hides which names `utils` contributes;
# unqualified helpers used in later cells (e.g. `documents`, `identity`)
# presumably come from here or from `transform` - confirm.
from utils import * 
from corpus import BaleenPickledCorpusReader

In [2]:
# Fixture locations are resolved against the current working directory, so
# the kernel must be started from the directory that contains `fixtures/`.
FIXTURES   = os.path.join(os.getcwd(), "fixtures")
ARTICLES   = os.path.join(FIXTURES, "articles")

# Corpus categories selected for this analysis; the same list is passed to
# the reader's summary here and to `documents(...)` in a later cell.
categories = ["news", "politics"]
corpus     = BaleenPickledCorpusReader(ARTICLES)
# Print the reader's description restricted to the selected categories.
print(corpus.describes(categories=categories))

Baleen corpus contains 1,308 files in 2 categories.
Structured as:
    25,055 paragraphs (19.155 mean paragraphs per file)
    41,659 sentences (1.663 mean sentences per paragraph).
Word count of 939,541 with a vocabulary of 41,153 (22.830 lexical diversity).
Corpus scan took 1.899251937866211 seconds.


In [3]:
from transform import * 
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation



### Simple LDA Model 

In [4]:
# Topic-model pipeline: normalize the raw documents, weight the resulting
# tokens with TF-IDF, then fit a 25-topic LDA model on the weighted matrix.
model = Pipeline([
    ('norm', TextNormalizer()),
    # The vectorizer is told not to tokenize, preprocess or lowercase on its
    # own; `identity` is used as the tokenizer (presumably a pass-through for
    # the token lists produced by TextNormalizer - confirm in the helpers).
    ('tfidf',  TfidfVectorizer(
        tokenizer=identity, preprocessor=None, lowercase=False
    )),
    # The topic count is passed positionally on purpose: the keyword was
    # `n_topics` before scikit-learn 0.19, became `n_components` in 0.19, and
    # `n_topics` was removed in 0.21. The first positional parameter is the
    # topic count in every release, so this spelling works on all of them.
    # A fixed random_state makes the fitted topics reproducible across a
    # Restart & Run All; without it every run yields different topics.
    ('lda', LatentDirichletAllocation(25, random_state=42)),
])

In [5]:
# Collect the input for the selected categories. `documents` is an
# unqualified helper brought in by a star import; presumably it returns one
# item per corpus document - confirm against its definition.
X = documents(corpus, categories=categories)
# Fit every pipeline step on X. The call is deliberately the last expression
# of the cell so that the notebook displays the array it returns.
model.fit_transform(X)



array([[ 0.00582492,  0.00582492,  0.00582492, ...,  0.00582492,
         0.00582492,  0.00582492],
       [ 0.01689608,  0.01689608,  0.01689608, ...,  0.01689608,
         0.01689608,  0.59449397],
       [ 0.00303881,  0.00303881,  0.00303881, ...,  0.00303881,
         0.00303881,  0.09972504],
       ..., 
       [ 0.00666966,  0.00666966,  0.00666966, ...,  0.00666966,
         0.00666966,  0.00666966],
       [ 0.00465198,  0.00465198,  0.00465198, ...,  0.00465198,
         0.00465198,  0.00465198],
       [ 0.00424296,  0.00424296,  0.00424296, ...,  0.00424296,
         0.00424296,  0.00424296]])

In [6]:
def topn_words(model, n=10, vectorizer=None):
    """
    Print the ``n`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : object
        Either a fitted estimator exposing ``components_`` directly, or a
        pipeline-like object whose last entry in ``steps`` exposes it.
    n : int, default 10
        Number of terms to list per topic.
    vectorizer : object, optional
        Fitted vectorizer that supplies the feature names. When omitted,
        the step registered as ``'tfidf'`` in ``model.named_steps`` is used,
        which requires ``model`` to be a pipeline.

    Returns
    -------
    None
        The report is printed as alternating ``Topic #<idx>`` header lines
        and space-separated term lines.
    """
    # Resolve the vectorizer before `model` is narrowed to the final
    # estimator; an explicit None check keeps a caller-supplied vectorizer
    # even if the object happens to evaluate as falsy.
    if vectorizer is None:
        vectorizer = model.named_steps['tfidf']

    estimator = model if hasattr(model, 'components_') else model.steps[-1][1]

    # scikit-learn removed get_feature_names() in 1.2 in favor of
    # get_feature_names_out(); prefer the new method and fall back to the
    # legacy one so the helper works on old and new releases alike.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()

    output = []
    for idx, topic in enumerate(estimator.components_):
        # argsort() orders indices by ascending weight; the reversed slice
        # walks back from the end, yielding the n largest weights first.
        features = topic.argsort()[:-n - 1:-1]

        output.append("Topic #{}".format(idx))
        output.append(" ".join(names[i] for i in features))

    print("\n".join(output))

In [7]:
topn_words(model)

Topic #0
× moveable fsg erades hemingway bookend rises persistently 不开吧通常发生 于是就有了做个感应小夜灯的想法
Topic #1
consul dundar crustal ionosphere cumhuriyet tulay karadeniz erdem reutersturkey adequacy
Topic #2
kepler afac ganek hsueh hedging swatch longson pauley cernobbio roubini
Topic #3
fitch biac abercrombie wool wangthe bauhaus aqr aeronautical nyembo mbuguje
Topic #4
s7 dinklage abaa promos antiquarian theseâ squaretrade youtubethe 317 editions
Topic #5
copycat nikon ele petapixel knockoff laneil optical lenses theirâ manfred
Topic #6
microcephaly chefâ imagesif georgesâ tang traces hengli amelia bodily infects
Topic #7
cleary kerlan cpp draghi rocketskates motorize acton praet packt ecb
Topic #8
cidra kadokawa manga retailing hbg classifiedâ anime harmonics yanai uniqlo
Topic #9
heroine eeoc qualidade primeira é katniss everdeen lua horses broughtâ
Topic #10
greenback laing contraction holliday vellitt suydam lovecraft ism markel ~~~
Topic #11
lev fairbanks iditarod levs haveâ snowâ northâ

Ideas:

- Topic Word Frequencies 
- 