# Model Development V1
- This is really more like scratchwork
- Divide this into multiple notebooks for easier reading

**Reference**
- http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

In [1]:
import json
import pickle
from pymongo import MongoClient
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

import nltk
import os
from nltk.corpus import stopwords
from sklearn.utils.extmath import randomized_svd

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics.pairwise as smp


import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# NYT Corpus

## Read data in
- pickle from mongod output on amazon ec2 instance

scp -i ~/.ssh/aws_andrew andrew@35.166.29.151:/home/andrew/Notebooks/initial-model-df.pkl ~/ds/metis/challenges/

In [2]:
with open('initial-model-df.pkl', 'rb') as nyt_data:
    df = pickle.load(nyt_data)

In [3]:
df.shape

(63097, 13)

In [4]:
df.columns

Index(['_id', 'date', 'desk', 'headline', 'id', 'lead_paragraph', 'locations',
       'section', 'source', 'subjects', 'type', 'url', 'word_count'],
      dtype='object')

In [5]:
df.head(30)

Unnamed: 0,_id,date,desk,headline,id,lead_paragraph,locations,section,source,subjects,type,url,word_count
0,5929421ebb9b17228ec4bc32,2006-01-01,Money and Business/Financial Desk,"b""At the Outset, It's a Matter Of Mood""",4fd24fd28eb7c8105d7f2980,"b""THE first week of the year may begin with tr...",[],Business,The New York Times,[],News,https://www.nytimes.com/2006/01/01/business/yo...,435.0
1,5929421ebb9b17228ec4bc33,2006-01-01,Money and Business/Financial Desk,b'Details Man to the Rescue',4fd24fd28eb7c8105d7f297e,b'I ALWAYS wanted to be a journalist and cover...,[],Business,The New York Times,[BIOGRAPHICAL INFORMATION],News,https://www.nytimes.com/2006/01/01/business/yo...,772.0
2,5929421ebb9b17228ec4bc35,2006-01-01,National Desk,b'Asking for 5 Percent More To Help Pay for th...,4fd24fd28eb7c8105d7f2986,"b""The Original Gourmet Brunch, a restaurant on...",[MASSACHUSETTS],U.S.,The New York Times,"[GAS (FUEL), RESTAURANTS, PRICES (FARES, FEES ...",News,https://www.nytimes.com/2006/01/01/national/01...,467.0
3,5929421ebb9b17228ec4bc36,2006-01-01,Magazine,b'Snow Daze',4fd262fd8eb7c8105d81514b,"b""Not too long ago it snowed in New York City....",[],Magazine,The New York Times,"[COMEDY AND HUMOR, SLEDS, SNOW AND SNOWSTORMS]",News,https://www.nytimes.com/2006/01/01/magazine/01...,989.0
4,5929421ebb9b17228ec4bc37,2006-01-01,Money and Business/Financial Desk,"b""Don't Want to Leave the Hotel? Buy the Room""",4fd269328eb7c8105d81f967,b'IMAGINE loving a hotel room so much that you...,[],Business,The New York Times,"[CONDOMINIUMS, HOTELS AND MOTELS, HOUSING, REN...",News,https://www.nytimes.com/2006/01/01/business/yo...,1350.0
5,5929421ebb9b17228ec4bd20,2006-02-04,Business,b'OPEC Chief Shrugs Off Oil Politics',5176ee5ecf28d02a61002941,"b'Gone are the militant years, when OPEC minis...",[],Business Day,The New York Times,[],News,https://www.nytimes.com/2006/02/04/business/wo...,1067.0
6,5929421ebb9b17228ec4bdd9,,,,,,[],,,[],,,
7,5929421ebb9b17228ec4bdda,,,,,,[],,,[],,,
8,5929421ebb9b17228ec4bc34,2006-01-01,Money and Business/Financial Desk,b'Novel Thinking As a Survival Tactic',4fd269328eb7c8105d81f963,"b""LARGE American companies, not just small sta...",[],Science; Technology; Business,The New York Times,"[UNITED STATES ECONOMY, HIRING AND PROMOTION, ...",Interview,https://www.nytimes.com/2006/01/01/business/yo...,890.0
9,5929421ebb9b17228ec4bc38,2006-01-01,Magazine,b'The Case For Contamination',4fd287918eb7c8105d8590a5,"b""1. I'm seated, with my mother, on a palace v...",[],Magazine,The New York Times,"[SOCIAL CONDITIONS AND TRENDS, INTERNATIONAL T...",News,https://www.nytimes.com/2006/01/01/magazine/01...,7298.0


In [46]:
df1 = df.dropna()

## LSI Preprocessing

In [47]:
# docs = data['lead_paragraph'][0:100]
docs = df1['lead_paragraph']

In [48]:
docs.shape

(65730,)

In [49]:
for doc in docs:
    doc = doc.decode("utf8")

In [50]:
# create a list of stopwords
stopwords_set = frozenset(stopwords.words('english'))

# Update iterator to remove stopwords
class SentencesIterator(object):
    # giving 'stop' a list of stopwords would exclude them
    def __init__(self, dirname, stop=None):
        self.dirname = dirname
 
    def __iter__(self):
        # os.listdr is ALSO a generator
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname),encoding="latin-1"):
                # at each step, gensim needs a list of words
                line = line.lower().split()
                if stop:
                    outline = []                
                    for word in line:
                        if word not in stopwords_set:
                            outline.append(word)
                    yield outline
                else:
                    yield line

In [51]:
docs1 = docs.dropna()

In [52]:
for doc in docs1:
    doc = SentencesIterator(doc.decode("utf8"))

In [53]:
docs = pd.Series.tolist(docs1)

In [54]:
tfidf = TfidfVectorizer(stop_words="english", 
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                        min_df=10)

tfidf_vecs = tfidf.fit_transform(docs)

In [55]:
tfidf_vecs.shape

# it's too big to see in a dataframe:
# pd.DataFrame(tfidf_vecs.todense(), 
#              columns=tfidf.get_feature_names()
#             ).head(30)

(65730, 11552)

## BASELINE:  Multinomial Naive Bayes Classification
- language is fundamentally different
- captures word choice

In [19]:
pd.DataFrame(tfidf_vecs.todense(), 
             columns=tfidf.get_feature_names()
            ).head()

Unnamed: 0,aaron,aarp,abandon,abandoned,abandoning,abbott,abc,abdullah,abe,abiding,...,zero,zhang,zimmerman,zip,zone,zones,zoo,zoom,zuckerberg,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df1.shape, tfidf_vecs.shape

((58727, 13), (58727, 10982))

In [21]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs, df1['source'], test_size=0.33)

# Train 
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Test 
nb.score(X_test, y_test)

0.95330237358101133

# LSI Begin
**Essentially, this has been my workflow so far:**
1. TFIDF in sklearn --> output a sparse corpus matrix DTM
2. LSI (SVD) in gensim --> output a 300 dim matrix TDM
  - Analyze topic vectors
3. Viewed LSI[tfidf]

In [58]:
# terms by docs instead of docs by terms
tfidf_corpus = matutils.Sparse2Corpus(tfidf_vecs.transpose())

# Row indices
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

# This is a hack for Python 3!
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, 
                                         id2word=id2word)

2017-06-02 06:53:08,735 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:09,009 : INFO : adding document #10000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:09,250 : INFO : adding document #20000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:09,472 : INFO : adding document #30000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:09,754 : INFO : adding document #40000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:09,940 : INFO : adding document #50000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:10,197 : INFO : adding document #60000 to Dictionary(0 unique tokens: [])
2017-06-02 06:53:10,354 : INFO : built Dictionary(11552 unique tokens: ['week', 'year', 'begin', 'traders', 'following']...) from 65730 documents (total 241491 corpus positions)


In [23]:
# Build an LSI space from the input TFIDF matrix, mapping of row id to word, and num_topics
# num_topics is the number of dimensions (k) to reduce to after the SVD

# Analagous to "fit" in sklearn, it primes an LSI space trained to 300-500 dimensions
lsi = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=300)

2017-06-01 17:57:40,661 : INFO : using serial LSI version on this node
2017-06-01 17:57:40,663 : INFO : updating model with new documents
2017-06-01 17:57:40,897 : INFO : preparing a new chunk of documents
2017-06-01 17:57:41,118 : INFO : using 100 extra samples and 2 power iterations
2017-06-01 17:57:41,119 : INFO : 1st phase: constructing (10982, 400) action matrix
2017-06-01 17:57:41,789 : INFO : orthonormalizing (10982, 400) action matrix
2017-06-01 17:57:44,333 : INFO : 2nd phase: running dense svd on (400, 20000) matrix
2017-06-01 17:57:45,361 : INFO : computing the final decomposition
2017-06-01 17:57:45,362 : INFO : keeping 300 factors (discarding 13.515% of energy spectrum)
2017-06-01 17:57:45,466 : INFO : processed documents up to #20000
2017-06-01 17:57:45,473 : INFO : topic #0(13.238): 0.275*"said" + 0.251*"company" + 0.216*"percent" + 0.189*"billion" + 0.181*"year" + 0.175*"new" + 0.142*"million" + 0.120*"quarter" + 0.111*"president" + 0.109*"business"
2017-06-01 17:57:45,

In [24]:
# Retrieve vectors for the original tfidf corpus in the LSI space ("transform" in sklearn)
lsi_corpus = lsi[tfidf_corpus] # pass using square brackets
# what are the values given by lsi? (topic distributions)

# ALSO, IT IS LAZY!  IT WON'T ACTUALLY DO THE TRANSFORMING COMPUTATION UNTIL ITS CALLED.  IT STORES THE INSTRUCTIONS

# Dump the resulting document vectors into a list so we can take a look
doc_vecs = [doc for doc in lsi_corpus]
doc_vecs[0] #print the first document vector for all the words

[(0, 0.15575247973568823),
 (1, -0.10482039534286403),
 (2, -0.079362271844062429),
 (3, -0.021640465608944625),
 (4, 0.07018983108364607),
 (5, -0.031455489453230588),
 (6, 0.048210610433166587),
 (7, -0.04094914984963096),
 (8, -0.046327069120478556),
 (9, -0.042841939719776061),
 (10, 0.028613628929569951),
 (11, 0.060629884126256361),
 (12, -0.008442274771832738),
 (13, 0.078020542972305623),
 (14, 0.047934948081525798),
 (15, -0.037202567002021701),
 (16, 0.031145050244952406),
 (17, 0.013688993756039077),
 (18, -0.07164810606095956),
 (19, -0.078581675007556737),
 (20, 0.02336945614271186),
 (21, -0.061068968445096239),
 (22, 0.048693468910641274),
 (23, -0.063838698453958925),
 (24, 0.098337952363926992),
 (25, 0.0019824190323215803),
 (26, 0.047152397266372117),
 (27, 0.0098798772524278412),
 (28, -0.039634415243872949),
 (29, 0.040863636805580812),
 (30, 0.020628349719208718),
 (31, 0.067998011645003034),
 (32, 0.046230882959977709),
 (33, 0.12376669691212144),
 (34, -0.003492

## Doc-Term Cosine Similarity using LSI Corpus
- cosine similarity of [docs to terms](http://localhost:8888/notebooks/ds/metis/classnotes/5.24.17%20Vector%20Space%20Models%2C%20NMF%2C%20W2V.ipynb#Toy-Example:-Conceptual-Similarity-Between-Arbitrary-Text-Blobs)

In [25]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations
nyt_lsi = matutils.corpus2dense(lsi_corpus, num_terms=300).transpose()

In [26]:
nyt_lsi.shape

(58727, 300)

In [27]:
lsi.show_topic(0)

[('president', 0.23522255614790213),
 ('said', 0.22246238960996406),
 ('new', 0.19309051989605105),
 ('obama', 0.19258154533955249),
 ('year', 0.15673502633961953),
 ('percent', 0.1501776523942312),
 ('health', 0.13714832771983579),
 ('company', 0.12390727158786559),
 ('federal', 0.12254441626987078),
 ('billion', 0.11362082634235728)]

In [28]:
# Create an index transformer that calculates similarity based on our space
index = similarities.MatrixSimilarity(lsi_corpus, num_features=len(id2word))

2017-06-01 17:58:20,168 : INFO : creating matrix with 58727 documents and 10982 features


In [29]:
# all docs by 300 topic vectors (word vectors)
pd.DataFrame(nyt_lsi).head()

# need to transform by cosine similarity
# look up if I need to change into an LDA corpus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.155643,-0.104799,-0.079692,-0.021232,0.069545,-0.030717,0.048921,0.041394,-0.046861,-0.042154,...,-0.015778,0.019955,-0.016297,0.00212,-0.003223,0.011393,-0.016167,0.00797,0.016439,-0.049429
1,0.089255,-0.01692,0.060164,0.062186,0.035706,0.055762,0.031934,0.020874,0.036525,0.011484,...,-0.004859,-0.021652,-0.007746,0.019102,-0.018241,0.003736,0.056186,0.025089,-0.002191,0.000423
2,0.099545,-0.061959,-0.01366,-0.021644,0.010869,-0.011716,0.01517,-0.002044,-0.00143,-0.009607,...,0.012332,0.007469,0.054603,0.009606,-0.023122,-0.014311,-0.019277,0.004845,-0.005152,0.026924
3,0.083508,-0.007581,0.044036,0.058782,0.038434,0.049601,0.043257,0.018161,0.028527,0.028337,...,0.004282,-0.017351,-0.012027,-0.013898,0.006955,0.013429,0.030023,-0.003201,-0.001473,-0.014914
4,0.042372,-0.019844,-0.002592,0.008795,0.00779,0.020612,0.001433,-0.006698,-0.021577,-0.020549,...,-0.019867,-0.014974,0.000478,-0.030875,0.043913,0.02833,-0.037282,0.022985,-0.01012,-0.007691


In [None]:
# take the mean of every word vector!  (averaged across all document vectors)
df.mean()

In [None]:
# describes word usage ('meaning') across the body of documents in the nyt corpus
# answers the question: what 'topics' has the nyt been talking about the most over 2005-2015?
df.mean().sort_values()

# Sorted doc-doc cosine similarity!

In [None]:
# Create an index transformer that calculates similarity based on our space
index = similarities.MatrixSimilarity(doc_vecs, num_features=len(id2word))

In [None]:
# Return the sorted list of cosine similarities to the first document
sims = sorted(enumerate(index[doc_vecs[0]]), key=lambda item: -item[1])
sims

# Document 1491 is very similar (.66) to document 0

In [None]:
# Let's take a look at how we did by analyzing syntax
for sim_doc_id, sim_score in enumerate(sims[0:30]): 
    print("DocumentID: {}, Similarity Score: {} ".format(sim_score[0], sim_score[1]))
    print("Headline: " + str(df1.iloc[sim_doc_id].headline.decode('utf-8')))
    print("Lead Paragraph: " + str(df1.iloc[sim_doc_id].lead_paragraph.decode('utf-8')))
    print("Publish Date: " + str(df1.iloc[sim_doc_id].date))
    print('\n')

## Pass into KMeans Clustering

In [29]:
# Convert the gensim-style corpus vecs to a numpy array for sklearn manipulations (back to docs to terms matrix)
nyt_lsi = matutils.corpus2dense(lsi_corpus, num_terms=300).transpose()
nyt_lsi.shape

(58727, 300)

In [30]:
# Create KMeans.  
kmeans = KMeans(n_clusters=3)

# Cluster
nyt_lsi_clusters = kmeans.fit_predict(nyt_lsi)

In [31]:
# Take a look.  It likely didn't do cosine distances.
print(nyt_lsi_clusters[0:50])
print("Lead Paragraph: \n" + str(df1.iloc[0:5].lead_paragraph))

[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1]
Lead Paragraph: 
0    b"THE first week of the year may begin with tr...
1    b'I ALWAYS wanted to be a journalist and cover...
2    b"The Original Gourmet Brunch, a restaurant on...
3    b"Not too long ago it snowed in New York City....
4    b'IMAGINE loving a hotel room so much that you...
Name: lead_paragraph, dtype: object


## LSA Begin

In [59]:
lda = models.LdaModel(corpus=tfidf_corpus, num_topics=20, id2word=id2word, passes=3)

lda.print_topics()

2017-06-02 06:53:20,196 : INFO : using symmetric alpha at 0.05
2017-06-02 06:53:20,198 : INFO : using symmetric eta at 8.65650969529e-05
2017-06-02 06:53:20,202 : INFO : using serial LDA version on this node
2017-06-02 06:53:21,447 : INFO : running online LDA training, 20 topics, 3 passes over the supplied corpus of 65730 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-06-02 06:53:21,466 : INFO : PROGRESS: pass 0, at document #2000/65730
2017-06-02 06:53:22,792 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:53:22,889 : INFO : topic #2 (0.050): 0.005*"new" + 0.003*"said" + 0.003*"billion" + 0.003*"commercials" + 0.003*"year" + 0.003*"play" + 0.003*"insurance" + 0.003*"years" + 0.003*"time" + 0.002*"come"
2017-06-02 06:53:22,890 : INFO : topic #4 (0.050): 0.005*"new" + 0.004*"said" + 0.004*"senator" + 0.003*"mr" + 0.003*"group" + 0.003*

2017-06-02 06:53:31,097 : INFO : topic #6 (0.050): 0.006*"union" + 0.006*"professionals" + 0.006*"pushing" + 0.006*"courses" + 0.005*"president" + 0.005*"delivery" + 0.005*"state" + 0.005*"obama" + 0.005*"parent" + 0.004*"workers"
2017-06-02 06:53:31,099 : INFO : topic diff=0.360791, rho=0.408248
2017-06-02 06:53:31,126 : INFO : PROGRESS: pass 0, at document #14000/65730
2017-06-02 06:53:33,120 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:53:33,220 : INFO : topic #15 (0.050): 0.005*"news" + 0.005*"service" + 0.005*"south" + 0.005*"transcript" + 0.005*"carolina" + 0.005*"debate" + 0.005*"president" + 0.005*"federal" + 0.004*"college" + 0.004*"following"
2017-06-02 06:53:33,221 : INFO : topic #1 (0.050): 0.006*"airlines" + 0.006*"valley" + 0.005*"european" + 0.005*"health" + 0.005*"silicon" + 0.005*"travelers" + 0.004*"approval" + 0.004*"crisis" + 0.004*"bad" + 0.004*"obama"
2017-06-02 06:53:33,223 : INFO : topic #5 (0.050): 0.013*"romney" + 

2017-06-02 06:53:44,737 : INFO : topic #14 (0.050): 0.014*"readers" + 0.012*"respond" + 0.006*"science" + 0.005*"letters" + 0.005*"facebook" + 0.004*"women" + 0.004*"twitter" + 0.004*"disorder" + 0.004*"humans" + 0.004*"politics"
2017-06-02 06:53:44,740 : INFO : topic diff=0.182329, rho=0.288675
2017-06-02 06:53:44,753 : INFO : PROGRESS: pass 0, at document #26000/65730
2017-06-02 06:53:45,799 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:53:45,880 : INFO : topic #0 (0.050): 0.005*"financial" + 0.004*"new" + 0.004*"banks" + 0.004*"said" + 0.004*"bank" + 0.004*"stores" + 0.004*"states" + 0.004*"company" + 0.004*"private" + 0.004*"account"
2017-06-02 06:53:45,881 : INFO : topic #9 (0.050): 0.006*"downturn" + 0.005*"payments" + 0.005*"obama" + 0.005*"limit" + 0.005*"reaction" + 0.004*"know" + 0.004*"female" + 0.004*"lack" + 0.004*"overhaul" + 0.004*"work"
2017-06-02 06:53:45,883 : INFO : topic #11 (0.050): 0.005*"change" + 0.004*"school" + 0.00

2017-06-02 06:53:52,364 : INFO : topic #17 (0.050): 0.008*"illegally" + 0.007*"gun" + 0.007*"spill" + 0.006*"official" + 0.006*"eric" + 0.006*"attorney" + 0.005*"department" + 0.005*"general" + 0.005*"mr" + 0.005*"shooting"
2017-06-02 06:53:52,366 : INFO : topic diff=0.107161, rho=0.235702
2017-06-02 06:53:52,379 : INFO : PROGRESS: pass 0, at document #38000/65730
2017-06-02 06:53:53,390 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:53:53,462 : INFO : topic #8 (0.050): 0.014*"percent" + 0.009*"year" + 0.009*"said" + 0.008*"billion" + 0.006*"economy" + 0.006*"bank" + 0.006*"growth" + 0.005*"government" + 0.005*"million" + 0.005*"china"
2017-06-02 06:53:53,463 : INFO : topic #15 (0.050): 0.014*"president" + 0.013*"obama" + 0.009*"republican" + 0.009*"house" + 0.009*"tax" + 0.007*"state" + 0.007*"party" + 0.006*"democrats" + 0.006*"republicans" + 0.005*"cuts"
2017-06-02 06:53:53,465 : INFO : topic #17 (0.050): 0.007*"eric" + 0.006*"official" + 

2017-06-02 06:54:02,673 : INFO : topic #1 (0.050): 0.012*"health" + 0.009*"care" + 0.008*"court" + 0.006*"obama" + 0.006*"supreme" + 0.006*"law" + 0.006*"administration" + 0.005*"insurance" + 0.005*"president" + 0.005*"iran"
2017-06-02 06:54:02,675 : INFO : topic diff=0.081242, rho=0.204124
2017-06-02 06:54:02,691 : INFO : PROGRESS: pass 0, at document #50000/65730
2017-06-02 06:54:03,687 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:54:03,751 : INFO : topic #19 (0.050): 0.011*"suspects" + 0.010*"terrorism" + 0.009*"rick" + 0.009*"detainees" + 0.007*"governments" + 0.007*"bay" + 0.005*"elect" + 0.005*"perry" + 0.005*"gov" + 0.005*"reverse"
2017-06-02 06:54:03,753 : INFO : topic #14 (0.050): 0.010*"readers" + 0.009*"respond" + 0.007*"speaker" + 0.006*"facebook" + 0.005*"house" + 0.005*"boehner" + 0.005*"senate" + 0.005*"president" + 0.005*"john" + 0.005*"paul"
2017-06-02 06:54:03,754 : INFO : topic #5 (0.050): 0.016*"romney" + 0.015*"obama" +

2017-06-02 06:54:13,571 : INFO : topic #1 (0.050): 0.016*"health" + 0.014*"court" + 0.012*"care" + 0.011*"supreme" + 0.009*"judicial" + 0.006*"law" + 0.006*"justices" + 0.005*"judiciary" + 0.005*"insurance" + 0.005*"judge"
2017-06-02 06:54:13,576 : INFO : topic #17 (0.050): 0.008*"gun" + 0.007*"justice" + 0.007*"chief" + 0.006*"judges" + 0.006*"attorney" + 0.006*"department" + 0.006*"general" + 0.006*"mr" + 0.005*"said" + 0.005*"federal"
2017-06-02 06:54:13,582 : INFO : topic diff=0.060321, rho=0.182574
2017-06-02 06:54:13,603 : INFO : PROGRESS: pass 0, at document #62000/65730
2017-06-02 06:54:14,683 : INFO : merging changes from 2000 documents into a model of 65730 documents
2017-06-02 06:54:14,744 : INFO : topic #3 (0.050): 0.007*"judges" + 0.006*"intelligence" + 0.005*"west" + 0.005*"liberal" + 0.005*"decisions" + 0.005*"ordered" + 0.005*"currency" + 0.004*"environmental" + 0.004*"water" + 0.004*"court"
2017-06-02 06:54:14,746 : INFO : topic #17 (0.050): 0.007*"gun" + 0.007*"chief"

2017-06-02 06:54:22,450 : INFO : topic #14 (0.050): 0.010*"readers" + 0.008*"respond" + 0.007*"speaker" + 0.005*"amendment" + 0.005*"facebook" + 0.005*"house" + 0.004*"magazine" + 0.004*"reid" + 0.004*"ballot" + 0.004*"senate"
2017-06-02 06:54:22,452 : INFO : topic #5 (0.050): 0.014*"clinton" + 0.0

In [60]:
lda_corpus = lda[tfidf_corpus]

In [61]:
nyt_lda = matutils.corpus2dense(lda_corpus, num_terms=20).transpose()
df3 = pd.DataFrame(nyt_lda)

In [62]:
df3.mean().sort_values(ascending=False).head(10)

8     0.134148
13    0.091355
15    0.070421
0     0.056733
1     0.052312
11    0.049195
17    0.048026
12    0.045700
7     0.044941
3     0.042192
dtype: float32

## Logistic Regression / Random Forest
- <s>Tried KNN Classifier </s>   Destroyed me
- probabilistic classification on a spectrum from nyt to natl enq

In [32]:
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics.pairwise as smp

In [33]:
# Train/Test
X_train, X_test, y_train, y_test = train_test_split(nyt_lsi, df1['source'], 
                                                    test_size=0.33)

In [34]:
# X_train = X_train.reshape(1,-1)
# X_test = X_test.reshape(1,-1)

y_train = np.reshape(y_train.values, (-1,1))
y_test = np.reshape(y_test.values, (-1,1))

In [35]:
X_train.shape, X_test.shape

((39347, 300), (19380, 300))

In [36]:
y_train.shape, y_test.shape

((39347, 1), (19380, 1))

In [1]:
# WARNING: This ruined me
# Need pairwise Cosine for KNN

# Fit KNN classifier to training set with cosine distance.  One of the best algorithms for clustering documents
# knn = KNeighborsClassifier(n_neighbors=3, metric=smp.cosine_distances)
# knn.fit(X_train, y_train)
# knn.score(X_test, y_test)

# PHASE 2: pull in natl enq data
- mix in labels, source labels
- pull labels (source category in nyt)

- Review Nlp notes
  - Feature trans & Pipelines
- Gensim doc2vec

In [66]:
with open('mag-model-df.pkl', 'rb') as mag_data:
    df1 = pickle.load(mag_data)

In [67]:
df1.head()

Unnamed: 0,_id,date,headline,lead_paragraph,locations,source,subjects,url,word_count
0,5930ea13bb9b1724340d59af,,,Elephants Run To Greet A Baby Elephant And It’...,,http://www.trueactivist.com/,"[using, world, variety, treat, oilall, cbd, ze...",http://www.trueactivist.com/category/news/,0
1,5930ea13bb9b1724340d59b0,,,Cyclist Saves Tiny Kitten On The Road And Kitt...,,http://www.trueactivist.com/,"[tiny, spot, reaction, road, activism, theyre,...",http://www.trueactivist.com/category/news/acti...,0
2,5930ea13bb9b1724340d59b1,,,Cyclist Saves Tiny Kitten On The Road And Kitt...,,http://www.trueactivist.com/,"[tiny, spot, reaction, cruelty, road, animal, ...",http://www.trueactivist.com/category/news/anim...,0
3,5930ea13bb9b1724340d59b2,,,"Baskin Robbins, Dunkin’ Donuts To Remove Artif...",,http://www.trueactivist.com/,"[instance, robbins, flavoringartificial, impli...",http://www.trueactivist.com/category/news/econ...,0
4,5930ea13bb9b1724340d59b3,,,This Scientist Is Suing The Grand Canyon… For ...,,http://www.trueactivist.com/,"[pizza, newest, issue, food, waste, environmen...",http://www.trueactivist.com/category/news/envi...,0


In [68]:
df1.dropna(axis=0, how='all')

Unnamed: 0,_id,date,headline,lead_paragraph,locations,source,subjects,url,word_count
0,5930ea13bb9b1724340d59af,,,Elephants Run To Greet A Baby Elephant And It’...,,http://www.trueactivist.com/,"[using, world, variety, treat, oilall, cbd, ze...",http://www.trueactivist.com/category/news/,0
1,5930ea13bb9b1724340d59b0,,,Cyclist Saves Tiny Kitten On The Road And Kitt...,,http://www.trueactivist.com/,"[tiny, spot, reaction, road, activism, theyre,...",http://www.trueactivist.com/category/news/acti...,0
2,5930ea13bb9b1724340d59b1,,,Cyclist Saves Tiny Kitten On The Road And Kitt...,,http://www.trueactivist.com/,"[tiny, spot, reaction, cruelty, road, animal, ...",http://www.trueactivist.com/category/news/anim...,0
3,5930ea13bb9b1724340d59b2,,,"Baskin Robbins, Dunkin’ Donuts To Remove Artif...",,http://www.trueactivist.com/,"[instance, robbins, flavoringartificial, impli...",http://www.trueactivist.com/category/news/econ...,0
4,5930ea13bb9b1724340d59b3,,,This Scientist Is Suing The Grand Canyon… For ...,,http://www.trueactivist.com/,"[pizza, newest, issue, food, waste, environmen...",http://www.trueactivist.com/category/news/envi...,0
5,5930ea13bb9b1724340d59b4,,,High School Confirms That Teen’s Service Dog I...,,http://www.trueactivist.com/,"[putting, variety, teens, yearbookmost, health...",http://www.trueactivist.com/category/news/health/,0
6,5930ea13bb9b1724340d59b5,,,U.S. Is Killing More Civilians In Syrian War T...,,http://www.trueactivist.com/,"[killed, war, isby, theyve, politics, hours, a...",http://www.trueactivist.com/category/news/news...,0
7,5930ea13bb9b1724340d59b6,,,Women Are Turning To Clean Energy After Climat...,,http://www.trueactivist.com/,"[world, women, change, turning, climate, landi...",http://www.trueactivist.com/category/news/pove...,0
8,5930ea13bb9b1724340d59b7,,,This Scientist Is Suing The Grand Canyon… For ...,,http://www.trueactivist.com/,"[fabricated, long, whyscience, reasons, grand,...",http://www.trueactivist.com/category/news/rights/,0
9,5930ea13bb9b1724340d59b8,,,Elephants Run To Greet A Baby Elephant And It’...,,http://www.trueactivist.com/,"[intelligent, watchelephants, empathetic, maje...",http://www.trueactivist.com/category/news/life/,0


In [69]:
df1.shape

(1461, 9)

In [70]:
docs2 = df1['lead_paragraph']

In [71]:
docs2 = docs2.dropna()

In [72]:
for doc in docs2:
    doc = SentencesIterator(doc)

In [73]:
docs = pd.Series.tolist(docs2)

In [74]:
tfidf = TfidfVectorizer(stop_words="english", 
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                        min_df=10)

tfidf_vecs = tfidf.fit_transform(docs)

In [75]:
tfidf_vecs.shape

(1461, 4404)

## BASELINE: Multinomial Naive Bayes

In [76]:
pd.DataFrame(tfidf_vecs.todense(), 
             columns=tfidf.get_feature_names()
            ).head()

Unnamed: 0,abandon,abc,ability,abject,able,abortion,abortions,abroad,absence,absolute,...,years,yes,yesterday,york,young,younger,youth,youtube,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.245499,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.245499,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs, df1['source'], test_size=0.33)

# Train 
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Test 
nb.score(X_test, y_test)

0.85300207039337472

## LSA Begin 2

In [78]:
# terms by docs instead of docs by terms
tfidf_corpus = matutils.Sparse2Corpus(tfidf_vecs.transpose())

# Row indices
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

# This is a hack for Python 3!
id2word = corpora.Dictionary.from_corpus(tfidf_corpus, 
                                         id2word=id2word)

2017-06-02 07:08:48,190 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-06-02 07:08:48,467 : INFO : built Dictionary(4404 unique tokens: ['run', 'baby', 'like', 'scene', 'disney']...) from 1461 documents (total 10956 corpus positions)


In [79]:
lda = models.LdaModel(corpus=tfidf_corpus, num_topics=20, id2word=id2word, passes=3)

lda.print_topics()

2017-06-02 07:08:51,580 : INFO : using symmetric alpha at 0.05
2017-06-02 07:08:51,582 : INFO : using symmetric eta at 0.000227066303361
2017-06-02 07:08:51,585 : INFO : using serial LDA version on this node
2017-06-02 07:08:52,088 : INFO : running online LDA training, 20 topics, 3 passes over the supplied corpus of 1461 documents, updating model once every 1461 documents, evaluating perplexity every 1461 documents, iterating 50x with a convergence threshold of 0.001000
2017-06-02 07:09:01,771 : INFO : -22.825 per-word bound, 7432456.2 perplexity estimate based on a held-out corpus of 1461 documents with 10956 words
2017-06-02 07:09:01,773 : INFO : PROGRESS: pass 0, at document #1461/1461
2017-06-02 07:09:04,148 : INFO : topic #7 (0.050): 0.009*"trump" + 0.004*"president" + 0.003*"paris" + 0.003*"climate" + 0.003*"people" + 0.003*"donald" + 0.003*"said" + 0.003*"new" + 0.002*"agreement" + 0.002*"cancer"
2017-06-02 07:09:04,149 : INFO : topic #17 (0.050): 0.006*"trump" + 0.003*"springer

2017-06-02 07:09:30,772 : INFO : topic #16 (0.050): 0.005*"missile" + 0.005*"trump" + 0.004*"illness" + 0.004*"new" + 0.003*"mental" + 0.003*"india" + 0.003*"female" + 0.003*"body" + 0.003*"dangerous" + 0.003*"going"
2017-06-02 07:09:30,774 : INFO : topic #17 (0.050): 0.005*"santorum" + 0.005*"budget" + 0.004*"trump" + 0.004*"springer" + 0.004*"jerry" + 0.004*"climate" + 0.003*"native" + 0.003*"coal" + 0.003*"ryan" + 0.003*"solar"
2017-06-02 07:09:30,776 : INFO : topic #18 (0.050): 0.006*"modi" + 0.005*"trump" + 0.004*"picture" + 0.004*"climate" + 0.004*"know" + 0.003*"significant" + 0.003*"senator" + 0.003*"summit" + 0.003*"peterson" + 0.003*"weather"
2017-06-02 07:09:30,778 : INFO : topic #19 (0.050): 0.005*"trump" + 0.004*"wade" + 0.003*"pulse" + 0.003*"healthcare" + 0.002*"said" + 0.002*"passed" + 0.002*"police" + 0.002*"letter" + 0.002*"news" + 0.002*"video"


[(0,
  '0.009*"location" + 0.008*"trump" + 0.006*"transgender" + 0.006*"tweet" + 0.004*"switch" + 0.004*"people" + 0.003*"twitter" + 0.003*"good" + 0.003*"add" + 0.003*"button"'),
 (1,
  '0.015*"cookies" + 0.014*"agreeing" + 0.013*"privacy" + 0.011*"experience" + 0.011*"policy" + 0.010*"liberty" + 0.010*"using" + 0.010*"writers" + 0.010*"possible" + 0.009*"best"'),
 (2,
  '0.012*"heat" + 0.012*"feed" + 0.012*"delivered" + 0.011*"reading" + 0.011*"street" + 0.010*"latest" + 0.010*"thanks" + 0.009*"like" + 0.008*"trump" + 0.006*"video"'),
 (3,
  '0.004*"video" + 0.003*"trump" + 0.003*"opposing" + 0.003*"negative" + 0.003*"behaviors" + 0.002*"topics" + 0.002*"radio" + 0.002*"social" + 0.002*"emotions" + 0.002*"time"'),
 (4,
  '0.012*"facebook" + 0.010*"like" + 0.004*"low" + 0.004*"police" + 0.003*"ring" + 0.003*"wikileaks" + 0.003*"driving" + 0.003*"old" + 0.003*"music" + 0.003*"expose"'),
 (5,
  '0.013*"trump" + 0.005*"president" + 0.003*"reading" + 0.003*"did" + 0.003*"feed" + 0.003*"he

In [80]:
lda_corpus = lda[tfidf_corpus]

In [81]:
nyt_lda = matutils.corpus2dense(lda_corpus, num_terms=20).transpose()
df3 = pd.DataFrame(nyt_lda)

In [82]:
df3.mean().sort_values(ascending=False).head(10)

2     0.218217
13    0.099866
5     0.081756
11    0.066481
7     0.055374
10    0.045812
6     0.043371
16    0.041813
12    0.034047
0     0.033162
dtype: float32

# Future Work =====================================

# Troubleshoot doc2vec
- look into the output of this

In [45]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing

In [44]:
# Create doc2Vec model
d2v = doc2vec.Doc2Vec(tfidf_corpus,min_count=3,workers=5)

2017-06-01 04:28:57,205 : INFO : collecting all words and their counts


AttributeError: 'list' object has no attribute 'words'

# PHASE 3: Visualize clusters
- [NLP visualization PyLDAvis](https://github.com/bmabey/pyLDAvis)
- [Bokeh](http://bokeh.pydata.org/en/latest/)
- [Bqplot](https://github.com/bloomberg/bqplot)
- I'd rather not d3...

In [25]:
with open('nyt-model-df.pkl', 'rb') as nyt_data:
    df = pickle.load(nyt_data)

In [26]:
with open('mag-model-df.pkl', 'rb') as mag_data:
    df1 = pickle.load(mag_data)

In [28]:
# select the relevant columns in our ratings dataset
nyt_df = df[['lead_paragraph', 'source']]
mag_df = df1[['lead_paragraph', 'source']]

In [31]:
# For the word cloud: https://www.jasondavies.com/wordcloud/
nyt_df['lead_paragraph'].to_csv(path='nyt-text.csv', index=False)

In [33]:
# For the word cloud: https://www.jasondavies.com/wordcloud/
mag_df['lead_paragraph'].to_csv(path='mag-text.csv', index=False)

In [32]:
!ls

2013_movies.csv                       challenge_set_5_andrew-RESUBMIT.ipynb
5.15.17 Sort, Search, Merge.ipynb     challenge_set_6_andrew-RESUBMIT.ipynb
Challenge_7+8_o.ipynb                 challenge_set_7_andrew.ipynb
NLP-model-development.ipynb           challenge_set_8_andrew.ipynb
NYT-Magazine Classifier.ipynb         challenge_set_X_TEMPLATE.ipynb
[1m[36mProj_Benson[m[m                           haberman.csv
[1m[36mProj_Luther[m[m                           house-votes-84.csv
Set7_Class_Models.pkl                 initial-model-df.pkl
Set7_House_Data.pkl                   mag-model-df.pkl
_challenge_7.ipynb                    mag.csv
challenge_set_15_andrew.ipynb         nyt-model-df.pkl
challenge_set_1_andrew.ipynb          nyt-text.csv
challenge_set_1_andrew_pandas.ipynb   [1m[36mtom_andrew[m[m
challenge_set_3_andrew-Copy1.ipynb
