# Model Development V2
- Final model development

## Process:
- Merging Dataframes
- Multinomial Naive Bayes Classifer
- Latent Semantic Indexing
- Clustered using Kmeans
- Hierarchical Dirichlet Process
- Latent Semantic Analysis

In [46]:
import csv
import json
import pickle
from pymongo import MongoClient
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline


import nltk
import os
from nltk.corpus import stopwords
from sklearn.utils.extmath import randomized_svd

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics.pairwise as smp


import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
with open('nyt-model-df.pkl', 'rb') as nyt_data:
    df = pickle.load(nyt_data)

In [3]:
with open('mag-model-df.pkl', 'rb') as mag_data:
    df1 = pickle.load(mag_data)

In [4]:
print('NYT Column Names: %s' % str(list(df.columns)))
print()
print('Magazine Column Names: %s' % str(list(df1.columns)))

NYT Column Names: ['_id', 'date', 'desk', 'headline', 'id', 'lead_paragraph', 'locations', 'section', 'source', 'subjects', 'type', 'url', 'word_count']

Magazine Column Names: ['_id', 'date', 'headline', 'lead_paragraph', 'locations', 'source', 'subjects', 'url', 'word_count']


# Merge DataFrames

In [5]:
# select the relevant columns in our ratings dataset
nyt_df = df[['lead_paragraph', 'source']]
mag = df1[['lead_paragraph', 'source']]

In [11]:
# remove the duplicates and drop their index
nyt = nyt_df.drop_duplicates().reset_index(drop=True)

In [12]:
nyt.shape, mag.shape

((44203, 2), (1461, 2))

In [13]:
frames = [nyt, mag]
super_df = pd.concat(frames).reset_index(drop=True)

In [27]:
super_df = super_df.dropna()

In [28]:
docs1 = super_df['lead_paragraph'].dropna()

In [19]:
# create a list of stopwords
stopwords_set = frozenset(stopwords.words('english'))

# Update iterator to remove stopwords
class SentencesIterator(object):
    # giving 'stop' a list of stopwords would exclude them
    def __init__(self, dirname, stop=None):
        self.dirname = dirname
 
    def __iter__(self):
        # os.listdr is ALSO a generator
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname),encoding="latin-1"):
                # at each step, gensim needs a list of words
                line = line.lower().split()
                if stop:
                    outline = []                
                    for word in line:
                        if word not in stopwords_set:
                            outline.append(word)
                    yield outline
                else:
                    yield line

In [34]:
for doc in docs1:
    try:
        doc = SentencesIterator(doc.decode("utf8"))
    except:
        doc = SentencesIterator(doc)

In [35]:
docs = pd.Series.tolist(docs1)

In [36]:
tfidf = TfidfVectorizer(stop_words="english", 
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                        min_df=10)

tfidf_vecs = tfidf.fit_transform(docs)

## BASELINE:  Multinomial Naive Bayes Classification
- language is fundamentally different
- captures word choice

In [37]:
super_df.shape, tfidf_vecs.shape

((45661, 2), (45661, 11039))

In [38]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs, super_df['source'], test_size=0.33)

# Train 
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Test 
nb.score(X_test, y_test)

0.92965691154024821

# LSI

In [39]:
# terms by docs instead of docs by terms
tfidf_corpus = matutils.Sparse2Corpus(tfidf_vecs.transpose())

# Row indices
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

# This is a hack for Python 3!
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, 
                                         id2word=id2word)

2017-06-02 00:25:53,061 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-06-02 00:25:53,317 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2017-06-02 00:25:53,550 : INFO : adding document #20000 to Dictionary(0 unique tokens: [])
2017-06-02 00:25:53,760 : INFO : adding document #30000 to Dictionary(0 unique tokens: [])
2017-06-02 00:25:53,973 : INFO : adding document #40000 to Dictionary(0 unique tokens: [])
2017-06-02 00:25:54,315 : INFO : built Dictionary(11039 unique tokens: ['week', 'year', 'begin', 'traders', 'following']...) from 45661 documents (total 171614 corpus positions)


In [40]:
# Build an LSI space from the input TFIDF matrix, mapping of row id to word, and num_topics
# num_topics is the number of dimensions (k) to reduce to after the SVD

# Analagous to "fit" in sklearn, it primes an LSI space trained to 300-500 dimensions
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=300)

2017-06-02 00:26:11,753 : INFO : using serial LSI version on this node
2017-06-02 00:26:11,756 : INFO : updating model with new documents
2017-06-02 00:26:11,965 : INFO : preparing a new chunk of documents
2017-06-02 00:26:12,130 : INFO : using 100 extra samples and 2 power iterations
2017-06-02 00:26:12,131 : INFO : 1st phase: constructing (11039, 400) action matrix
2017-06-02 00:26:12,792 : INFO : orthonormalizing (11039, 400) action matrix
2017-06-02 00:26:15,440 : INFO : 2nd phase: running dense svd on (400, 20000) matrix
2017-06-02 00:26:16,864 : INFO : computing the final decomposition
2017-06-02 00:26:16,865 : INFO : keeping 300 factors (discarding 13.537% of energy spectrum)
2017-06-02 00:26:17,006 : INFO : processed documents up to #20000
2017-06-02 00:26:17,022 : INFO : topic #0(12.742): 0.264*"said" + 0.241*"company" + 0.220*"percent" + 0.193*"new" + 0.190*"billion" + 0.179*"year" + 0.140*"million" + 0.116*"quarter" + 0.116*"president" + 0.105*"business"
2017-06-02 00:26:17,

In [41]:
# Retrieve vectors for the original tfidf corpus in the LSI space ("transform" in sklearn)
lsi_corpus = lsi[tfidf_corpus] # pass using square brackets
# what are the values given by lsi? (topic distributions)

# ALSO, IT IS LAZY!  IT WON'T ACTUALLY DO THE TRANSFORMING COMPUTATION UNTIL ITS CALLED.  IT STORES THE INSTRUCTIONS

# Dump the resulting document vectors into a list so we can take a look
doc_vecs = [doc for doc in lsi_corpus]
doc_vecs[0] #print the first document vector for all the words

[(0, 0.15533940996385798),
 (1, -0.11443357310137645),
 (2, -0.069883434604592717),
 (3, 0.03128888602728104),
 (4, 0.0056151311184067642),
 (5, 0.037579775276381885),
 (6, 0.0056391859399586974),
 (7, -0.048841357880438904),
 (8, -0.042645190947676943),
 (9, 0.010445443745784806),
 (10, -0.0023039158865131443),
 (11, 0.0040793759449554304),
 (12, 0.054925492029007518),
 (13, 0.027194190797549728),
 (14, 0.072464145615996442),
 (15, 0.045298921333395331),
 (16, -0.12594135861077249),
 (17, -0.046037530124871232),
 (18, 0.10872538080764575),
 (19, -0.026747474875840144),
 (20, -0.015414245530867499),
 (21, -0.054761977071098492),
 (22, 0.049046991819983982),
 (23, -0.0046458094284868778),
 (24, -0.025200565702233006),
 (25, 0.045317640466476808),
 (26, 0.051311237450918062),
 (27, -0.0011872277454880325),
 (28, -0.033970921065671331),
 (29, -0.003070227617084597),
 (30, 0.10233636618325402),
 (31, 0.037529350295109787),
 (32, 0.1233000691637634),
 (33, 0.010314613717880977),
 (34, -0.00

In [42]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations
nyt_lsi = matutils.corpus2dense(lsi_corpus, num_terms=300).transpose()

In [43]:
nyt_lsi.shape

(45661, 300)

In [44]:
lsi.show_topic(0)

[('said', 0.23426827028743039),
 ('president', 0.21450617969314997),
 ('new', 0.20191275942750156),
 ('obama', 0.16269203421152051),
 ('year', 0.15590669401728785),
 ('percent', 0.14311946650426094),
 ('company', 0.13020678593513726),
 ('federal', 0.11623707834974627),
 ('health', 0.11025160234956857),
 ('billion', 0.10854295129235096)]

In [45]:
# all docs by 300 topic vectors (word vectors)
pd.DataFrame(nyt_lsi).head()

# need to transform by cosine similarity
# look up if I need to change into an LDA corpus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.155339,-0.114434,-0.069883,0.031289,0.005615,0.03758,0.005639,-0.048841,-0.042645,0.010445,...,0.01275,-0.009708,-0.017504,-0.010762,-0.031847,0.008844,-0.022516,-0.010638,0.016826,0.029
1,0.100299,0.001887,0.087996,0.039671,0.069129,0.007378,0.022367,-0.021855,0.008078,-0.006684,...,0.011057,0.007967,-0.024887,0.016316,-0.002321,-0.011468,0.007423,-0.015779,-0.004818,-0.019763
2,0.10469,-0.065565,-0.022781,0.000598,0.013961,-0.016065,-0.019311,-0.001848,-0.031518,0.006569,...,0.030651,0.000354,-0.013898,-0.00207,-0.002805,0.000152,-0.003459,0.034199,0.004971,0.0133
3,0.088339,0.001713,0.067384,0.02479,0.061572,0.027282,0.046405,-0.003955,-0.00592,0.001248,...,0.005964,-0.013618,-0.006522,-0.023858,0.002676,0.000633,0.014756,-0.016777,-0.026615,0.001741
4,0.042912,-0.023308,0.007149,0.008073,0.008589,0.003543,-0.006408,0.014612,0.00836,-0.013418,...,-0.012801,0.032441,0.006916,-0.013562,0.036257,-0.016925,0.031858,0.002256,0.016407,-0.017025


# Clustering - KMeans

In [51]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations (back to docs to terms matrix)
nyt_lsi = matutils.corpus2dense(lsi_corpus, num_terms=300).transpose()
nyt_lsi.shape

(45661, 300)

In [52]:
# Create KMeans.  
kmeans = KMeans(n_clusters=3)

# Cluster
nyt_lsi_clusters = kmeans.fit_predict(nyt_lsi)

In [53]:
# Take a look.  It likely didn't do cosine distances.
print(nyt_lsi_clusters[0:50])
print("Lead Paragraph: \n" + str(df1.iloc[0:5].lead_paragraph))

[2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 2 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 2]
Lead Paragraph: 
0    Elephants Run To Greet A Baby Elephant And It’...
1    Cyclist Saves Tiny Kitten On The Road And Kitt...
2    Cyclist Saves Tiny Kitten On The Road And Kitt...
3    Baskin Robbins, Dunkin’ Donuts To Remove Artif...
4    This Scientist Is Suing The Grand Canyon… For ...
Name: lead_paragraph, dtype: object


# Did an HDP
An HDP model is fully unsupervised. It can also determine the ideal number of topics it needs through posterior inference.

In [79]:
hdpmodel = models.HdpModel(corpus=tfidf_corpus, id2word=id2word)











2017-06-02 05:53:02,465 : WA

In [80]:
hdpmodel.show_topics()

[(0,
  '0.001*icing + 0.001*new + 0.001*hearing + 0.001*crap + 0.001*charges + 0.001*shining + 0.001*revered + 0.001*tight + 0.001*nintendo + 0.001*dial + 0.001*homeowner + 0.001*threats + 0.001*consequences + 0.001*mines + 0.001*lesser + 0.001*implies + 0.001*talking + 0.001*wpp + 0.001*intensified + 0.001*blogging'),
 (1,
  '0.001*said + 0.001*fat + 0.001*tumbling + 0.001*inspections + 0.001*counted + 0.001*deadline + 0.001*bonus + 0.001*little + 0.001*smooth + 0.001*steep + 0.001*meg + 0.001*stable + 0.001*assigned + 0.001*fortunes + 0.001*permit + 0.001*limbo + 0.001*cats + 0.001*cancer + 0.001*restraint + 0.000*inch'),
 (2,
  '0.001*black + 0.001*creating + 0.001*startling + 0.001*supporter + 0.001*slate + 0.001*renegotiate + 0.001*garden + 0.001*dem + 0.001*peaceful + 0.001*sector + 0.001*preparing + 0.001*plus + 0.001*unbelievably + 0.001*corps + 0.001*southern + 0.001*release + 0.001*pollution + 0.001*grid + 0.001*municipalities + 0.001*hill'),
 (3,
  '0.001*bible + 0.001*arran

In [81]:
hdptopics = hdpmodel.show_topics(formatted=False)

In [84]:
lda1 = hdpmodel.suggested_lda_model()

2017-06-02 06:14:50,347 : INFO : using symmetric eta at 9.05879155721e-05
2017-06-02 06:14:50,359 : INFO : using serial LDA version on this node


In [91]:
# out of 150 topics, shows top 20 HDP topics
lda1.print_topics()

2017-06-02 06:17:33,817 : INFO : topic #111 (0.007): 0.000*"saddam" + 0.000*"told" + 0.000*"accurate" + 0.000*"palestinian" + 0.000*"redstone" + 0.000*"temptation" + 0.000*"customer" + 0.000*"carmakers" + 0.000*"barnes" + 0.000*"grandson"
2017-06-02 06:17:33,819 : INFO : topic #147 (0.006): 0.000*"asking" + 0.000*"factors" + 0.000*"things" + 0.000*"leaning" + 0.000*"attachment" + 0.000*"longer" + 0.000*"slightest" + 0.000*"ministry" + 0.000*"pictures" + 0.000*"loophole"
2017-06-02 06:17:33,821 : INFO : topic #129 (0.006): 0.000*"australian" + 0.000*"firestorm" + 0.000*"hell" + 0.000*"grants" + 0.000*"reliance" + 0.000*"sullivan" + 0.000*"awakening" + 0.000*"brought" + 0.000*"exceptions" + 0.000*"heading"
2017-06-02 06:17:33,823 : INFO : topic #82 (0.007): 0.000*"rays" + 0.000*"flat" + 0.000*"convoluted" + 0.000*"hefty" + 0.000*"colorado" + 0.000*"defect" + 0.000*"passions" + 0.000*"estimated" + 0.000*"happening" + 0.000*"valued"
2017-06-02 06:17:33,824 : INFO : topic #135 (0.006): 0.00

[(111,
  '0.000*"saddam" + 0.000*"told" + 0.000*"accurate" + 0.000*"palestinian" + 0.000*"redstone" + 0.000*"temptation" + 0.000*"customer" + 0.000*"carmakers" + 0.000*"barnes" + 0.000*"grandson"'),
 (147,
  '0.000*"asking" + 0.000*"factors" + 0.000*"things" + 0.000*"leaning" + 0.000*"attachment" + 0.000*"longer" + 0.000*"slightest" + 0.000*"ministry" + 0.000*"pictures" + 0.000*"loophole"'),
 (129,
  '0.000*"australian" + 0.000*"firestorm" + 0.000*"hell" + 0.000*"grants" + 0.000*"reliance" + 0.000*"sullivan" + 0.000*"awakening" + 0.000*"brought" + 0.000*"exceptions" + 0.000*"heading"'),
 (82,
  '0.000*"rays" + 0.000*"flat" + 0.000*"convoluted" + 0.000*"hefty" + 0.000*"colorado" + 0.000*"defect" + 0.000*"passions" + 0.000*"estimated" + 0.000*"happening" + 0.000*"valued"'),
 (135,
  '0.000*"earlier" + 0.000*"adding" + 0.000*"cow" + 0.000*"doing" + 0.000*"flights" + 0.000*"disabled" + 0.000*"unconventional" + 0.000*"declare" + 0.000*"charleston" + 0.000*"fuller"'),
 (139,
  '0.000*"glasse

In [104]:
lda1_corpus = lda1[tfidf_corpus]

In [105]:
nyt_lda1 = matutils.corpus2dense(lda1_corpus, num_terms=150).transpose()

In [106]:
df3 = pd.DataFrame(nyt_lda1)

In [107]:
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.323609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.790793,0.0,0.0,0.0


In [108]:
# take the mean of every word vector!  (averaged across all document vectors)
# describes word usage ('meaning') across the body of documents in the nyt corpus
# answers the question: what 'topics' has the nyt been talking about the most over 2005-2015?
df3.mean().sort_values()

144    0.003471
76     0.003665
135    0.003696
139    0.003841
132    0.003888
52     0.003994
136    0.003998
96     0.004022
62     0.004050
147    0.004075
13     0.004093
47     0.004097
82     0.004109
141    0.004128
77     0.004152
8      0.004168
71     0.004184
64     0.004188
36     0.004199
129    0.004231
65     0.004245
137    0.004250
42     0.004353
134    0.004358
110    0.004387
143    0.004419
97     0.004423
75     0.004429
93     0.004432
149    0.004433
         ...   
17     0.005835
84     0.005838
98     0.005863
41     0.005936
57     0.005996
80     0.006080
22     0.006113
30     0.006144
83     0.006164
44     0.006165
72     0.006175
126    0.006192
85     0.006252
103    0.006324
118    0.006374
21     0.006392
19     0.006467
66     0.006474
61     0.006513
39     0.006539
6      0.006580
4      0.006592
100    0.006675
125    0.006708
130    0.006792
131    0.006874
114    0.007066
45     0.007387
12     0.007485
7      0.007775
Length: 150, dtype: floa

# Do an LDA here

In [115]:
lda = models.LdaModel(corpus=tfidf_corpus, num_topics=20, id2word=id2word, passes=3)
# LDA does not scale super well.  It can get you great results on 100,000 docs, but 1000 topics on 10e7 docs and it does a poor job.

# LDA is a good latent feature for unsupervised clustering

2017-06-02 06:32:53,865 : INFO : using symmetric alpha at 0.05
2017-06-02 06:32:53,869 : INFO : using symmetric eta at 9.05879155721e-05
2017-06-02 06:32:53,876 : INFO : using serial LDA version on this node
2017-06-02 06:32:55,046 : INFO : running online LDA training, 20 topics, 3 passes over the supplied corpus of 45661 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-06-02 06:32:55,072 : INFO : PROGRESS: pass 0, at document #2000/45661
2017-06-02 06:32:56,465 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:32:56,554 : INFO : topic #3 (0.050): 0.006*"new" + 0.004*"company" + 0.004*"house" + 0.004*"said" + 0.003*"war" + 0.003*"businesses" + 0.003*"mr" + 0.003*"run" + 0.003*"power" + 0.003*"don"
2017-06-02 06:32:56,556 : INFO : topic #16 (0.050): 0.004*"companies" + 0.004*"said" + 0.004*"big" + 0.004*"company" + 0.003*"new" + 0.003*"str

2017-06-02 06:33:08,115 : INFO : topic #19 (0.050): 0.009*"president" + 0.008*"transcript" + 0.008*"obama" + 0.007*"union" + 0.007*"following" + 0.006*"numbers" + 0.006*"unemployment" + 0.005*"speech" + 0.005*"address" + 0.005*"state"
2017-06-02 06:33:08,118 : INFO : topic diff=0.429399, rho=0.408248
2017-06-02 06:33:08,140 : INFO : PROGRESS: pass 0, at document #14000/45661
2017-06-02 06:33:09,466 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:33:09,568 : INFO : topic #3 (0.050): 0.008*"percent" + 0.007*"economy" + 0.006*"growth" + 0.005*"jobless" + 0.005*"said" + 0.005*"new" + 0.004*"businesses" + 0.004*"spending" + 0.004*"forecast" + 0.004*"emissions"
2017-06-02 06:33:09,569 : INFO : topic #17 (0.050): 0.007*"percent" + 0.006*"company" + 0.006*"goods" + 0.006*"exports" + 0.005*"billion" + 0.005*"united" + 0.005*"said" + 0.005*"states" + 0.005*"deficit" + 0.005*"provided"
2017-06-02 06:33:09,570 : INFO : topic #1 (0.050): 0.008*"european" +

2017-06-02 06:33:18,137 : INFO : topic #4 (0.050): 0.007*"learn" + 0.006*"schools" + 0.006*"marriage" + 0.006*"students" + 0.006*"test" + 0.006*"school" + 0.005*"college" + 0.005*"airlines" + 0.005*"rural" + 0.004*"isn"
2017-06-02 06:33:18,139 : INFO : topic diff=0.196794, rho=0.288675
2017-06-02 06:33:18,155 : INFO : PROGRESS: pass 0, at document #26000/45661
2017-06-02 06:33:19,427 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:33:19,504 : INFO : topic #18 (0.050): 0.006*"market" + 0.005*"percent" + 0.005*"investors" + 0.005*"oil" + 0.005*"prices" + 0.005*"government" + 0.005*"economy" + 0.004*"rates" + 0.004*"increased" + 0.004*"housing"
2017-06-02 06:33:19,506 : INFO : topic #3 (0.050): 0.009*"mexican" + 0.007*"economy" + 0.006*"russia" + 0.005*"climate" + 0.005*"born" + 0.005*"provisions" + 0.004*"prices" + 0.004*"new" + 0.004*"figures" + 0.004*"government"
2017-06-02 06:33:19,507 : INFO : topic #8 (0.050): 0.010*"rubio" + 0.009*"marco" 

2017-06-02 06:33:25,445 : INFO : topic #12 (0.050): 0.007*"nuclear" + 0.006*"scientists" + 0.005*"limited" + 0.004*"mean" + 0.004*"material" + 0.004*"compromise" + 0.004*"priests" + 0.004*"research" + 0.004*"murder" + 0.004*"children"
2017-06-02 06:33:25,447 : INFO : topic diff=0.116279, rho=0.235702
2017-06-02 06:33:25,463 : INFO : PROGRESS: pass 0, at document #38000/45661
2017-06-02 06:33:26,433 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:33:26,502 : INFO : topic #12 (0.050): 0.006*"nuclear" + 0.005*"scientists" + 0.005*"murder" + 0.005*"mean" + 0.004*"material" + 0.004*"limited" + 0.004*"wounded" + 0.004*"compromise" + 0.004*"convicted" + 0.004*"arnold"
2017-06-02 06:33:26,503 : INFO : topic #6 (0.050): 0.007*"readers" + 0.007*"attorney" + 0.007*"terrorists" + 0.006*"suspect" + 0.006*"questions" + 0.005*"swiss" + 0.005*"times" + 0.005*"remarks" + 0.005*"democracy" + 0.005*"settlement"
2017-06-02 06:33:26,505 : INFO : topic #4 (0.050): 

2017-06-02 06:33:44,406 : INFO : topic #6 (0.050): 0.006*"readers" + 0.006*"reaction" + 0.005*"suspect" + 0.005*"attorney" + 0.005*"county" + 0.005*"sean" + 0.005*"questions" + 0.005*"times" + 0.004*"editor" + 0.004*"democracy"
2017-06-02 06:33:44,408 : INFO : topic #11 (0.050): 0.008*"intelligence" + 0.008*"location" + 0.007*"al" + 0.007*"brown" + 0.007*"jr" + 0.006*"council" + 0.006*"transgender" + 0.006*"police" + 0.006*"killing" + 0.006*"jerry"
2017-06-02 06:33:44,410 : INFO : topic diff=0.090447, rho=0.200681
2017-06-02 06:33:44,435 : INFO : PROGRESS: pass 1, at document #4000/45661
2017-06-02 06:33:45,399 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:33:45,460 : INFO : topic #5 (0.050): 0.008*"reddit" + 0.008*"president" + 0.006*"gun" + 0.006*"obama" + 0.006*"new" + 0.005*"phone" + 0.005*"elect" + 0.004*"aren" + 0.004*"times" + 0.004*"downturn"
2017-06-02 06:33:45,461 : INFO : topic #3 (0.050): 0.017*"heat" + 0.016*"feed" + 0.013*"late

2017-06-02 06:33:51,080 : INFO : topic #12 (0.050): 0.005*"statistics" + 0.005*"iphone" + 0.004*"bond" + 0.004*"like" + 0.004*"modern" + 0.004*"applications" + 0.004*"scientists" + 0.004*"people" + 0.003*"limited" + 0.003*"earned"
2017-06-02 06:33:51,081 : INFO : topic diff=0.082330, rho=0.200681
2017-06-02 06:33:51,096 : INFO : PROGRESS: pass 1, at document #16000/45661
2017-06-02 06:33:52,120 : INFO : merging changes from 2000 documents into a model of 45661 documents
2017-06-02 06:33:52,178 : INFO : topic #8 (0.050): 0.008*"michigan" + 0.008*"bonuses" + 0.008*"st" + 0.007*"restore" + 0.007*"stopped" + 0.006*"worse" + 0.006*"crimes" + 0.006*"cnbc" + 0.005*"discrimination" + 0.005*"lender"
2017-06-02 06:33:52,179 : INFO : topic #7 (0.050): 0.013*"romney" + 0.011*"mitt" + 0.009*"clinton" + 0.008*"senator" + 0.007*"hillary" + 0.006*"campaign" + 0.006*"presidential" + 0.006*"republican" + 0.006*"rodham" + 0.006*"john"
2017-06-02 06:33:52,181 : INFO : topic #14 (0.050): 0.010*"president" 

2017-06-02 06:33:59,598 : INFO : topic #10 (0.050): 0.015*"percent" + 0.010*"said" + 0.010*"year" + 0.010*"company" + 0.008*"quarter" + 0.008*"sales" + 0.008*"million" + 0.007*"billion" + 0.006*"rose" + 0.006*"pr

In [116]:
# Let's take a look at what happened.  Here are the 10 most important words for each of the 3 topics we found:
lda.print_topics()

2017-06-02 06:35:05,891 : INFO : topic #0 (0.050): 0.021*"court" + 0.012*"supreme" + 0.008*"federal" + 0.008*"law" + 0.008*"judge" + 0.006*"state" + 0.005*"ruled" + 0.005*"president" + 0.005*"judges" + 0.005*"appeals"
2017-06-02 06:35:05,894 : INFO : topic #1 (0.050): 0.011*"paris" + 0.009*"european" + 0.006*"bank" + 0.005*"cnn" + 0.005*"euro" + 0.005*"central" + 0.005*"financial" + 0.005*"facebook" + 0.004*"regulators" + 0.004*"nations"
2017-06-02 06:35:05,897 : INFO : topic #2 (0.050): 0.012*"tweets" + 0.009*"terror" + 0.009*"weather" + 0.008*"reactions" + 0.008*"crisis" + 0.007*"solar" + 0.007*"hate" + 0.007*"negative" + 0.007*"abuse" + 0.007*"victims"
2017-06-02 06:35:05,900 : INFO : topic #3 (0.050): 0.026*"heat" + 0.026*"feed" + 0.021*"latest" + 0.019*"climate" + 0.012*"featured" + 0.010*"like" + 0.009*"minister" + 0.008*"change" + 0.008*"regulations" + 0.007*"prime"
2017-06-02 06:35:05,902 : INFO : topic #4 (0.050): 0.035*"thanks" + 0.017*"comments" + 0.012*"gay" + 0.011*"marria

[(0,
  '0.021*"court" + 0.012*"supreme" + 0.008*"federal" + 0.008*"law" + 0.008*"judge" + 0.006*"state" + 0.005*"ruled" + 0.005*"president" + 0.005*"judges" + 0.005*"appeals"'),
 (1,
  '0.011*"paris" + 0.009*"european" + 0.006*"bank" + 0.005*"cnn" + 0.005*"euro" + 0.005*"central" + 0.005*"financial" + 0.005*"facebook" + 0.004*"regulators" + 0.004*"nations"'),
 (2,
  '0.012*"tweets" + 0.009*"terror" + 0.009*"weather" + 0.008*"reactions" + 0.008*"crisis" + 0.007*"solar" + 0.007*"hate" + 0.007*"negative" + 0.007*"abuse" + 0.007*"victims"'),
 (3,
  '0.026*"heat" + 0.026*"feed" + 0.021*"latest" + 0.019*"climate" + 0.012*"featured" + 0.010*"like" + 0.009*"minister" + 0.008*"change" + 0.008*"regulations" + 0.007*"prime"'),
 (4,
  '0.035*"thanks" + 0.017*"comments" + 0.012*"gay" + 0.011*"marriage" + 0.010*"email" + 0.009*"judicial" + 0.009*"school" + 0.008*"schools" + 0.007*"sex" + 0.006*"students"'),
 (5,
  '0.011*"gun" + 0.007*"reddit" + 0.007*"withdrawing" + 0.006*"covfefe" + 0.006*"shootin

In [117]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[tfidf_corpus] #corpus is the data

# lists the topic distribution per document:  
# list(lda_corpus)

In [59]:
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

In [60]:
# Check out the document vectors in the topic space for the first 15 documents
lda_docs[0:15]

[[(0, 0.2461201379684409), (1, 0.16269826230316989), (2, 0.59118159972838913)],
 [(0, 0.74130772103371301),
  (1, 0.20812329956249245),
  (2, 0.050568979403794567)],
 [(0, 0.24121405864154841),
  (1, 0.11379733246971466),
  (2, 0.64498860888873699)],
 [(0, 0.7253616828269015),
  (1, 0.092141146246419781),
  (2, 0.18249717092667864)],
 [(0, 0.51561021131153173), (1, 0.24882156935503244), (2, 0.2355682193334358)],
 [(0, 0.086164706706201327),
  (1, 0.41195007470157691),
  (2, 0.50188521859222168)],
 [(0, 0.48883320901635291),
  (1, 0.13133545808781241),
  (2, 0.37983133289583465)],
 [(0, 0.70698357917858445),
  (1, 0.22146639337741511),
  (2, 0.071550027444000547)],
 [(0, 0.11129339192242074),
  (1, 0.051941022095322822),
  (2, 0.83676558598225648)],
 [(0, 0.056106952790293853),
  (1, 0.40552638320379064),
  (2, 0.53836666400591549)],
 [(0, 0.19504077789498345),
  (1, 0.74046462525553269),
  (2, 0.064494596849483915)],
 [(0, 0.083890711870936996),
  (1, 0.067678563067464836),
  (2, 0.848

In [118]:
nyt_lda = matutils.corpus2dense(lda_corpus, num_terms=20).transpose()
df3 = pd.DataFrame(nyt_lda)

In [120]:
df3.mean().sort_values(ascending=False).head(10)

18    0.108904
10    0.089831
9     0.081572
15    0.067136
0     0.067074
14    0.059336
1     0.055699
7     0.051737
13    0.042247
19    0.041734
dtype: float32

## Logistic Regression / Random Forest
- <s>Tried KNN Classifier </s>   Destroyed my memory
- probabilistic classification on a spectrum from nyt to natl enq

In [None]:
# remember to pull in the final article dumps from EC2 instance