In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
for i in range(1,88):
    if i == 1:
        batch = pd.read_csv('./batch{}.csv'.format(i), index_col='index')
        batch.drop(batch.columns[0], axis=1, inplace=True)
    else:
        csv = pd.read_csv('./batch{}.csv'.format(i), index_col='index')
        csv.drop(csv.columns[0], axis=1, inplace=True)
        batch = pd.concat([batch, csv])

In [4]:
df=pd.read_csv('./HealthMap_Scraped.csv')

In [5]:
df.Headline.head()

0    Kutcher tells Council about water situation - ...
1    Under boil order, Murray awaits water test res...
2    Puerto Ricans at Risk of Waterborne Disease Ou...
3    Howes: In Flint water war, city only has one c...
4    DEP orders PWSA to make critical infrastrucure...
Name: Headline, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import PorterStemmer
import string
from nltk.corpus import stopwords

In [7]:
# Code taken from Richard Harris
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [8]:
text = pd.DataFrame(df['Headline'])
text

Unnamed: 0,Headline
0,Kutcher tells Council about water situation - ...
1,"Under boil order, Murray awaits water test res..."
2,Puerto Ricans at Risk of Waterborne Disease Ou...
3,"Howes: In Flint water war, city only has one c..."
4,DEP orders PWSA to make critical infrastrucure...
5,Private water wells in Texas test positive for...
6,How long it could take to repair massive Oakla...
7,"Boil water advisories issued for Clinton, Wils..."
8,Happening Now BREAKING- Boil Water Order Lifte...
9,Boil water order lifted in Astoria - Daily Ast...


In [9]:
cv = CountVectorizer(preprocessor=cleaner)
X = cv.fit_transform(text['Headline'].values)
X

<7724x5403 sparse matrix of type '<class 'numpy.int64'>'
	with 61650 stored elements in Compressed Sparse Row format>

In [10]:
feature_names = cv.get_feature_names()
feature_names

['ab',
 'abat',
 'abbotsford',
 'abc',
 'abcchicagocom',
 'abccolumbiacom',
 'abccom',
 'abcnewscom',
 'abconyoursidecom',
 'aberdeen',
 'abid',
 'abilen',
 'abita',
 'abq',
 'absaroke',
 'acacio',
 'acadianalafayett',
 'access',
 'accid',
 'accord',
 'accur',
 'achiev',
 'ackley',
 'acr',
 'across',
 'action',
 'actionnewsjaxcom',
 'activ',
 'ad',
 'ada',
 'adair',
 'adam',
 'adamsvil',
 'add',
 'addison',
 'addit',
 'address',
 'adel',
 'adirondack',
 'adjac',
 'adjust',
 'administr',
 'adrian',
 'advanc',
 'advantag',
 'advertis',
 'advis',
 'adviso',
 'advisori',
 'advisory',
 'advistori',
 'advoc',
 'af',
 'affect',
 'africa',
 'aftermath',
 'afternoon',
 'age',
 'agenc',
 'ago',
 'ahead',
 'aiken',
 'air',
 'airi',
 'airport',
 'akhiok',
 'akron',
 'alabama',
 'alabamascom',
 'alamo',
 'alamogordo',
 'alaska',
 'alban',
 'albani',
 'albert',
 'albion',
 'albuquerqu',
 'alcest',
 'alcom',
 'alden',
 'alert',
 'alexand',
 'alexandria',
 'alexi',
 'alga',
 'algier',
 'algonquin',
 '

In [16]:
lda = LatentDirichletAllocation(n_topics=6)

lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=6,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [17]:
print(lda.components_.shape)

(6, 5403)


In [18]:
results = pd.DataFrame(lda.components_, columns=feature_names)

In [19]:
# Use less than 6
for topic in range(6):
    print('Topic', topic)
    word_list = results.T[topic].sort_values(ascending=False).index
    print(' '.join(word_list[0:25]), '\n')

Topic 0
beach bacteria close state due found water high news level abc swim coli shore test sever new two journal ask njcom jersey problem advisori lake 

Topic 1
jacksonvil still effect mt without newstalk missourian wcsi chronicl height grove eagl lexington oak hill newsjourn logan southeast use estat weather interrupt free pine ky 

Topic 2
water boil order boilwat advisori lift issu news resid press counti time daili fall drink citi republican lake jackson cancel offici herald plattsburgh democrat releas 

Topic 3
boil water order issu advisori lift notic counti break news main custom citi daili part resid area effect gazett kfv journal charleston district alert subscript 

Topic 4
beach close wboytv valley park north road street blog republ affect lake wsaztv press contamin newsday island ktb health west review lafayett report ecoli sewag 

Topic 5
patchcom tribun time town public school creek hill beach outbreak coli advis ledger record san news local counti close three daili lea