In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

In [3]:
df = pd.read_csv('abcnews-date-text.csv')
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
df_short = df.copy()
df_short = df_short.drop(['publish_date'], axis=1)
df_short.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:
df.iat[0,1]

'aba decides against community broadcasting licence'

## Data Preprocessing

In [8]:
df['headline_text'].dropna(inplace=True)
#removing numbers from string
df['headline_text'] = df['headline_text'].str.replace('\d+', '')

df['headline_text'] = [entry.lower() for entry in df['headline_text']]
#df['headline']= [word_tokenize(entry) for entry in df['headline']]
df.head()


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [9]:
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize 
stemmer = PorterStemmer() 
  
# stem words in the list of tokenised words 
def stem_words(text): 
    word_tokens = word_tokenize(text) 
    stems = [stemmer.stem(word) for word in word_tokens] 
    return stems 
df2= df.copy()
df2['headline_text'] = [stem_words(entry) for entry in df2['headline_text']]
df2.head()

Unnamed: 0,publish_date,headline_text
0,20030219,"[aba, decid, against, commun, broadcast, licenc]"
1,20030219,"[act, fire, wit, must, be, awar, of, defam]"
2,20030219,"[a, g, call, for, infrastructur, protect, summit]"
3,20030219,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,20030219,"[air, nz, strike, to, affect, australian, travel]"


In [30]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))                  
#df2=df.copy()
def remove_stops(row):
    my_list = row['headline_text']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)
df3=df2.copy()
df3['headline_text'] = df3.apply(remove_stops, axis=1)
df3.head()

Unnamed: 0,publish_date,headline_text
0,20030219,"[aba, decid, commun, broadcast, licenc]"
1,20030219,"[act, fire, wit, must, awar, defam]"
2,20030219,"[g, call, infrastructur, protect, summit]"
3,20030219,"[air, nz, staff, aust, strike, pay, rise]"
4,20030219,"[air, nz, strike, affect, australian, travel]"


In [31]:
df6=df3.copy()
df6= df6.drop(['publish_date'], axis=1)
df6.head()

Unnamed: 0,headline_text
0,"[aba, decid, commun, broadcast, licenc]"
1,"[act, fire, wit, must, awar, defam]"
2,"[g, call, infrastructur, protect, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travel]"


In [16]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [32]:
dictionary = gensim.corpora.Dictionary(df6['headline_text'])
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 aba
1 broadcast
2 commun
3 decid
4 licenc
5 act
6 awar
7 defam
8 fire
9 must
10 wit


In [33]:
bow_corpus = [dictionary.doc2bow(doc) for doc in df6['headline_text']]
bow_corpus[4000]


[(151, 1), (560, 1), (1385, 1), (4636, 1)]

In [34]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5929540978827899),
 (1, 0.4753689825619525),
 (2, 0.3101866300328752),
 (3, 0.40056040343663335),
 (4, 0.4071429552006896)]


## Implementing Models

### LDA using bag of words

In [35]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary)

In [36]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.017*"market" + 0.013*"tasmania" + 0.012*"price" + 0.012*"open" + 0.011*"share" + 0.010*"victoria" + 0.009*"island" + 0.009*"christma" + 0.008*"storm" + 0.008*"campaign"
Topic: 1 
Words: 0.045*"new" + 0.013*"council" + 0.013*"chang" + 0.012*"health" + 0.011*"say" + 0.009*"school" + 0.009*"indigen" + 0.008*"servic" + 0.008*"meet" + 0.008*"worker"
Topic: 2 
Words: 0.036*"polic" + 0.022*"man" + 0.019*"die" + 0.019*"crash" + 0.018*"car" + 0.015*"death" + 0.013*"investig" + 0.012*"woman" + 0.011*"driver" + 0.011*"attack"
Topic: 3 
Words: 0.030*"man" + 0.026*"court" + 0.024*"charg" + 0.020*"year" + 0.020*"murder" + 0.019*"interview" + 0.018*"face" + 0.015*"found" + 0.013*"accus" + 0.012*"sex"
Topic: 4 
Words: 0.026*"govern" + 0.019*"nsw" + 0.017*"rural" + 0.014*"qld" + 0.014*"say" + 0.014*"state" + 0.013*"nation" + 0.011*"labor" + 0.010*"protest" + 0.010*"support"
Topic: 5 
Words: 0.025*"australia" + 0.020*"trump" + 0.019*"win" + 0.017*"day" + 0.014*"one" + 0.014*"first" + 

In [22]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4224846363067627	 
Topic: 0.031*"to" + 0.023*"in" + 0.016*"market" + 0.013*"live" + 0.012*"on" + 0.012*"rise" + 0.011*"farmer" + 0.011*"price" + 0.011*"tasmania" + 0.010*"rate"

Score: 0.4173772633075714	 
Topic: 0.041*"in" + 0.031*"the" + 0.025*"of" + 0.024*"australia" + 0.019*"to" + 0.015*"a" + 0.014*"for" + 0.014*"win" + 0.013*"out" + 0.012*"day"

Score: 0.020026937127113342	 
Topic: 0.040*"to" + 0.037*"for" + 0.017*"countri" + 0.014*"hour" + 0.011*"call" + 0.009*"of" + 0.009*"a" + 0.009*"more" + 0.008*"guilti" + 0.008*"tasmanian"

Score: 0.020018765702843666	 
Topic: 0.061*"to" + 0.021*"for" + 0.016*"rural" + 0.015*"north" + 0.013*"in" + 0.011*"fund" + 0.011*"health" + 0.010*"on" + 0.009*"say" + 0.009*"new"

Score: 0.02001785673201084	 
Topic: 0.051*"to" + 0.039*"be" + 0.037*"for" + 0.012*"in" + 0.012*"miss" + 0.010*"may" + 0.010*"protest" + 0.010*"have" + 0.009*"search" + 0.008*"of"

Score: 0.020016200840473175	 
Topic: 0.029*"of" + 0.017*"govern" + 0.017*"over" + 0.015*

In [38]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.21798069741739484


In [46]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=35, id2word=dictionary)
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.2756025676076595


In [50]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=45, id2word=dictionary)
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.28771298038709


In [49]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=55, id2word=dictionary)
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.29872410356567186


### LDA using TF_IDF

In [39]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5929540978827899),
 (1, 0.4753689825619525),
 (2, 0.3101866300328752),
 (3, 0.40056040343663335),
 (4, 0.4071429552006896)]


In [40]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.021*"trump" + 0.019*":" + 0.010*"australia" + 0.009*"day" + 0.007*"live" + 0.007*"world" + 0.006*"test" + 0.006*"energi" + 0.006*"win" + 0.006*"juli"
Topic: 1 Word: 0.022*"man" + 0.015*"polic" + 0.014*"charg" + 0.012*"murder" + 0.011*"woman" + 0.011*"crash" + 0.009*"court" + 0.009*"car" + 0.008*"found" + 0.008*"drum"
Topic: 2 Word: 0.010*"turnbul" + 0.009*"elect" + 0.007*"labor" + 0.006*"marriag" + 0.006*"abus" + 0.006*"$" + 0.006*"liber" + 0.006*"abbott" + 0.006*"malcolm" + 0.006*"royal"
Topic: 3 Word: 0.015*"news" + 0.011*"abc" + 0.011*"rural" + 0.009*"nrl" + 0.008*"christma" + 0.008*"sport" + 0.008*"nation" + 0.007*"friday" + 0.007*"septemb" + 0.007*"peter"
Topic: 4 Word: 0.008*"grandstand" + 0.007*"us" + 0.006*"islam" + 0.006*"kill" + 0.006*"terror" + 0.005*"refuge" + 0.005*"syria" + 0.005*"australian" + 0.005*"australia" + 0.005*"attack"
Topic: 5 Word: 0.007*"stori" + 0.007*"decemb" + 0.006*"quiz" + 0.005*":" + 0.005*"survey" + 0.005*"patient" + 0.005*"grand" + 0.

In [41]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5366288423538208	 
Topic: 0.012*"north" + 0.011*"queensland" + 0.010*"market" + 0.007*"share" + 0.007*"south" + 0.007*"west" + 0.007*"whi" + 0.006*"price" + 0.006*"australian" + 0.006*"rise"

Score: 0.30329087376594543	 
Topic: 0.007*"stori" + 0.007*"decemb" + 0.006*"quiz" + 0.005*":" + 0.005*"survey" + 0.005*"patient" + 0.005*"grand" + 0.005*"anim" + 0.004*"afl" + 0.004*"wait"

Score: 0.02001289092004299	 
Topic: 0.015*"news" + 0.011*"abc" + 0.011*"rural" + 0.009*"nrl" + 0.008*"christma" + 0.008*"sport" + 0.008*"nation" + 0.007*"friday" + 0.007*"septemb" + 0.007*"peter"

Score: 0.020012356340885162	 
Topic: 0.023*"countri" + 0.021*"hour" + 0.011*"podcast" + 0.007*"wa" + 0.007*"nsw" + 0.006*"rural" + 0.005*"sa" + 0.005*"univers" + 0.005*"jame" + 0.005*"qld"

Score: 0.020010607317090034	 
Topic: 0.010*"turnbul" + 0.009*"elect" + 0.007*"labor" + 0.006*"marriag" + 0.006*"abus" + 0.006*"$" + 0.006*"liber" + 0.006*"abbott" + 0.006*"malcolm" + 0.006*"royal"

Score: 0.02001049183309

In [42]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.33752818000333545


In [43]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.019*"govern" + 0.016*"hill" + 0.016*"hunter" + 0.013*"station" + 0.013*"novemb" + 0.012*"energi" + 0.010*"project" + 0.010*"ga" + 0.009*"billion" + 0.009*"capit"
Topic: 1 Word: 0.020*"royal" + 0.018*"whi" + 0.017*"commiss" + 0.014*"rugbi" + 0.012*"video" + 0.011*"shot" + 0.009*"rape" + 0.008*"polic" + 0.008*"homeless" + 0.008*"invest"
Topic: 2 Word: 0.026*"turnbul" + 0.014*"monday" + 0.014*"stori" + 0.014*"yo" + 0.011*"white" + 0.010*"major" + 0.010*"india" + 0.010*"insid" + 0.009*"fiji" + 0.008*"clinton"
Topic: 3 Word: 0.016*"health" + 0.014*"speak" + 0.013*"mental" + 0.011*"smith" + 0.010*"retir" + 0.010*"png" + 0.009*"univers" + 0.008*"memori" + 0.008*"manu" + 0.008*"pari"
Topic: 4 Word: 0.018*"price" + 0.017*"market" + 0.015*"rate" + 0.015*"farm" + 0.014*"share" + 0.014*"dollar" + 0.012*"rise" + 0.010*"care" + 0.010*"age" + 0.010*"newcastl"
Topic: 5 Word: 0.025*"man" + 0.021*"interview" + 0.021*"murder" + 0.020*"charg" + 0.016*"donald" + 0.015*"sex" + 0.013*"assaul

In [44]:
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3731153590198484


In [45]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=35, id2word=dictionary)
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4759197352623956


In [47]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=45, id2word=dictionary)
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.8491633194086815


In [48]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=55, id2word=dictionary)
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=df6['headline_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.8491633194086815
