**Potential Applications:** classify articles by topic

In [1]:
# From: http://brandonrose.org/clustering
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [2]:
# Load the pickled article metadata. The cells below rely on the columns
# DatePublished, Tags and FullText.
# NOTE: read_pickle unpickles arbitrary objects -- only load files you trust.
articles = pd.read_pickle('ArticleMetadata.pkl')

articles['DatePublished'] = pd.to_datetime(articles['DatePublished'])
# str() also converts missing values (e.g. a float NaN becomes the text 'nan')
articles['Tags'] = articles['Tags'].map(str)
# A new column has to be created with bracket assignment: `articles.TagArray = ...`
# only sets a Python attribute on the DataFrame object and adds no column.
articles['TagArray'] = articles['Tags'].map(lambda x: x.split(','))
# .iloc[0] shows the first row by position, whatever the index labels are
articles['TagArray'].iloc[0]

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


['Capitalism',
 'Competition',
 'Property Rights',
 'Entrepreneurship',
 'Free Markets',
 'Market Process',
 'Biographies',
 'Innovation',
 'Arts and Music',
 'Technology']

In [13]:
# peek at the FullText value of the first row of `articles`
articles.head(1).FullText

ArticleId
12897    "AMC's Halt and Catch Fire is a brilliant achi...
Name: FullText, dtype: object

## Stopwords, stemming, and tokenizing

In [3]:
# load nltk's English stopwords as variable called 'stopwords'
# NOTE(review): apart from the preview cell, nothing visible in this notebook
# reads `stopwords`; the TfidfVectorizer is configured with stop_words='english'
# instead -- confirm whether this list is still needed.
stopwords = nltk.corpus.stopwords.words('english')

In [7]:
# show the first ten entries of the stopword list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [9]:
# load nltk's SnowballStemmer as the variable 'stemmer'
# (tokenize_and_stem looks this module-level name up each time it is called)
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [8]:
# here I define a tokenizer and stemmer which returns the list of stems (in order, duplicates kept) for the text it is passed

def tokenize_and_stem(text):
    """Return the stems of every letter-bearing token in ``text``.

    The text is split into sentences and then into words with nltk, so that
    punctuation ends up as its own token. Tokens without at least one ASCII
    letter (numbers, bare punctuation) are discarded, and each remaining token
    is stemmed with the module-level ``stemmer``.

    Returns a list of stems in document order; duplicates are kept.
    """
    has_letter = re.compile('[a-zA-Z]')
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # skip numeric tokens and raw punctuation
            if has_letter.search(word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lowercased, letter-bearing tokens of ``text`` without stemming.

    The text is split into sentences and then into words with nltk, so that
    punctuation ends up as its own token. Each token is lowercased, and tokens
    without at least one ASCII letter (numbers, bare punctuation) are dropped.

    Returns a list of tokens in document order; duplicates are kept.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # keep only tokens that contain a letter
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [15]:
# Build two flat, parallel vocabularies over every article: the stemmed tokens
# and the lowercased, unstemmed tokens.
totalvocab_stemmed, totalvocab_tokenized = [], []
for article_text in articles.FullText:
    # tokenize/stem and tokenize-only versions of the same article
    allwords_stemmed = tokenize_and_stem(article_text)
    allwords_tokenized = tokenize_only(article_text)

    # in-place list concatenation keeps both vocabularies flat
    totalvocab_stemmed += allwords_stemmed
    totalvocab_tokenized += allwords_tokenized

In [16]:
# `allwords_stemmed` is the variable left over from the vocabulary loop, so
# this shows the stems of the last article processed only
allwords_stemmed

['the',
 'current',
 'american',
 'elect',
 'has',
 'me',
 'think',
 'a',
 'lot',
 'about',
 '17th',
 'centuri',
 'england',
 'the',
 '1650s',
 'were',
 'a',
 'time',
 'of',
 'unpreced',
 'upheav',
 'in',
 'england',
 'the',
 'decad',
 'was',
 'usher',
 'in',
 'with',
 'the',
 'public',
 'trial',
 'and',
 'execut',
 'of',
 'the',
 'king',
 'and',
 'the',
 'creation',
 'of',
 'an',
 'entir',
 'new',
 'form',
 'of',
 'govern',
 'vote',
 'in',
 'liter',
 'at',
 'gunpoint',
 'the',
 'aftermath',
 'of',
 'a',
 'long',
 'and',
 'bloodi',
 'civil',
 'war',
 'left',
 'the',
 'nation',
 "'s",
 'polit',
 'and',
 'religion',
 'uncertain',
 'and',
 'the',
 'nation',
 "'s",
 'peopl',
 'divid',
 'the',
 'poet',
 'robert',
 'herrick',
 'right',
 'said',
 'of',
 'the',
 'time',
 'that',
 'sick',
 'is',
 'the',
 'land',
 'to',
 'th',
 'heart',
 'and',
 'doth',
 'endur',
 'more',
 'danger',
 'faint',
 'by',
 'her',
 "desp'rat",
 'cure.â\x80\x9d',
 'in',
 'the',
 'middl',
 'of',
 'all',
 'the',
 'upheav'

In [18]:
# Lookup table from stem to original token: the index holds the stems and the
# 'words' column holds the matching lowercased, unstemmed tokens.
vocab_frame = pd.DataFrame(
    data={'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)
print('there are ' + str(len(vocab_frame.index)) + ' items in vocab_frame')

there are 19239198 items in vocab_frame


In [21]:
# preview the stem -> token table (index: stem, 'words': unstemmed token)
vocab_frame.head(30)

Unnamed: 0,words
amc,amc
's,'s
halt,halt
and,and
catch,catch
fire,fire
is,is
a,a
brilliant,brilliant
achiev,achievement


## Tf-idf and document similarity

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(articles.FullText) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 12min 14s, sys: 9.03 s, total: 12min 23s
Wall time: 12min 37s
(13835, 399)


In [25]:
# Feature names (the n-gram terms), positionally aligned with the columns of
# tfidf_matrix. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() when it exists and fall
# back to the old method on older releases. Either way `terms` is a list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between all documents (1 - cosine similarity of
# their tf-idf rows).
# The two bare `print` lines that used to follow were Python 2 blank-line
# statements; on Python 3 they only evaluate the function object (the cell
# echoed `<function print>`), so they have been removed.
dist = 1 - cosine_similarity(tfidf_matrix)

<function print>

## K-means clustering

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; import the
# standalone joblib package first and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Cache the fitted model on disk: reuse the saved model when the file exists,
# otherwise save the model that was just fitted so a fresh run does not fail
# on a missing pickle. Delete the file to force the freshly fitted model to
# be used.
# NOTE: joblib.load unpickles and can execute arbitrary code -- only load
# files you created yourself.
model_path = 'doc_cluster.pkl'
if os.path.exists(model_path):
    km = joblib.load(model_path)
else:
    joblib.dump(km, model_path)

clusters = km.labels_.tolist()

In [None]:
# Collect per-document fields together with the cluster assignments.
# NOTE(review): `titles`, `ranks`, `synopses` and `genres` are not defined in
# any cell visible in this notebook (they look like leftovers from the films
# tutorial this was adapted from) -- confirm where they come from, or build
# this frame from `articles` instead.
films = { 'title': titles, 'rank': ranks, 'synopsis': synopses, 'cluster': clusters, 'genre': genres }

# rows are indexed by cluster id; 'synopsis' is not listed in `columns`, so it
# is not carried into the frame
frame = pd.DataFrame(films, index = [clusters] , columns = ['rank', 'title', 'cluster', 'genre'])

frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

In [None]:
# Group the 'rank' column by each row's cluster id.
# NOTE(review): relies on `frame` having 'rank' and 'cluster' columns -- the
# source of 'rank' is not visible in this notebook; confirm it applies to articles.
grouped = frame['rank'].groupby(frame['cluster']) #groupby cluster for aggregation purposes

grouped.mean() #average rank (1 to 100) per cluster

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()