In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/million-headlines/abcnews-date-text.csv


In [44]:
data=pd.read_csv("../input/million-headlines/abcnews-date-text.csv")

In [45]:
data_text=data[:5000][['headline_text']];

In [46]:
data_text['index']=data_text.index

In [47]:
documents=data_text

In [48]:
print(len(documents))

5000


In [49]:
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [50]:
documents.isnull().sum()

headline_text    0
index            0
dtype: int64

## Data Preprocessing

In [51]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [52]:
corpus=[]
for i in range(0,len(documents)):
    text=re.sub('[^a-zA-z]',' ',documents['headline_text'][i])
    text=text.lower()
    text=text.split()
    text=[PorterStemmer().stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)
    

In [54]:
for i in range(0,len(corpus)):
    documents['headline_text'][i]=corpus[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [55]:
documents[["headline_text"]]

Unnamed: 0,headline_text
0,aba decid commun broadcast licenc
1,act fire wit must awar defam
2,g call infrastructur protect summit
3,air nz staff aust strike pay rise
4,air nz strike affect australian travel
...,...
4995,slater star blue day one
4996,soprano film delay contract disput
4997,souri outlin region road fund
4998,south east water license pay levi


In [61]:
processed_docs=documents['headline_text']

In [65]:
processed_docs

0             [aba, decid, commun, broadcast, licenc]
1                 [act, fire, wit, must, awar, defam]
2           [g, call, infrastructur, protect, summit]
3           [air, nz, staff, aust, strike, pay, rise]
4       [air, nz, strike, affect, australian, travel]
                            ...                      
4995                   [slater, star, blue, day, one]
4996         [soprano, film, delay, contract, disput]
4997              [souri, outlin, region, road, fund]
4998         [south, east, water, license, pay, levi]
4999         [sri, lanka, hope, new, zealand, defeat]
Name: headline_text, Length: 5000, dtype: object

## Bag of words on the dataset

In [66]:
import gensim


In [67]:
dictionary=gensim.corpora.Dictionary(processed_docs)

In [68]:
count=0
for k,v in dictionary.iteritems():
    print(k,v)
    count+=1
    if count>20:
        break

0 aba
1 broadcast
2 commun
3 decid
4 licenc
5 act
6 awar
7 defam
8 fire
9 must
10 wit
11 call
12 g
13 infrastructur
14 protect
15 summit
16 air
17 aust
18 nz
19 pay
20 rise


In [69]:
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=1000)

In [70]:
bow_corpus=[dictionary.doc2bow(doc) for doc in processed_docs]

In [72]:
bow_corpus[100]

[(46, 1), (160, 1)]

In [75]:
bow_corpus_100=bow_corpus[100]
for i in range(len(bow_corpus_100)):
    print("Word {} (\"{}\") appears {} time.".format(bow_corpus_100[i][0], 
                                                     dictionary[bow_corpus_100[i][0]], 
                                                     bow_corpus_100[i][1]))

Word 46 ("urg") appears 1 time.
Word 160 ("women") appears 1 time.


## TF-IDF on our document set

In [77]:
from gensim import corpora,models
tfidf=models.TfidfModel(bow_corpus)

In [78]:
corpus_tfidf=tfidf[bow_corpus]

## Running LDA using Bag of Words

In [79]:
lda_model=gensim.models.LdaMulticore(bow_corpus,
                                    num_topics=10,
                                    id2word=dictionary,
                                    passes=2)

In [80]:
for idx,topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.056*"govt" + 0.026*"council" + 0.024*"may" + 0.023*"drought" + 0.018*"vic" + 0.014*"end" + 0.013*"plan" + 0.012*"fight" + 0.012*"new" + 0.011*"rule"


Topic: 1 
Words: 0.045*"iraq" + 0.025*"us" + 0.022*"death" + 0.016*"missil" + 0.014*"hit" + 0.013*"play" + 0.013*"two" + 0.012*"driver" + 0.012*"attack" + 0.011*"trade"


Topic: 2 
Words: 0.023*"cup" + 0.021*"world" + 0.018*"lead" + 0.017*"die" + 0.016*"open" + 0.015*"kill" + 0.014*"ban" + 0.013*"woman" + 0.012*"call" + 0.012*"hope"


Topic: 3 
Words: 0.028*"polic" + 0.025*"say" + 0.022*"continu" + 0.022*"win" + 0.021*"plan" + 0.018*"court" + 0.014*"union" + 0.013*"crash" + 0.012*"air" + 0.012*"offer"


Topic: 4 
Words: 0.033*"call" + 0.030*"hospit" + 0.027*"polic" + 0.018*"back" + 0.017*"clash" + 0.016*"concern" + 0.015*"coast" + 0.015*"probe" + 0.014*"man" + 0.014*"qld"


Topic: 5 
Words: 0.041*"man" + 0.029*"new" + 0.019*"jail" + 0.019*"health" + 0.017*"iraq" + 0.013*"us" + 0.013*"franc" + 0.013*"top" + 0.012*"seaso

## Running LDA using TF-IDF

In [81]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=10, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)

In [82]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.024*"us" + 0.024*"iraq" + 0.019*"lead" + 0.018*"work" + 0.017*"govt" + 0.016*"troop" + 0.015*"turn" + 0.015*"announc" + 0.013*"bushfir" + 0.013*"reach"


Topic: 1 Word: 0.032*"plan" + 0.031*"charg" + 0.022*"win" + 0.021*"man" + 0.020*"concern" + 0.018*"ban" + 0.018*"us" + 0.016*"fund" + 0.016*"aussi" + 0.015*"back"


Topic: 2 Word: 0.022*"meet" + 0.018*"hit" + 0.017*"get" + 0.016*"council" + 0.014*"plane" + 0.014*"court" + 0.014*"welcom" + 0.014*"terror" + 0.013*"nation" + 0.013*"indian"


Topic: 3 Word: 0.022*"nsw" + 0.021*"investig" + 0.020*"probe" + 0.018*"death" + 0.017*"commun" + 0.017*"move" + 0.014*"council" + 0.014*"fire" + 0.013*"offer" + 0.013*"missil"


Topic: 4 Word: 0.023*"call" + 0.019*"iraqi" + 0.015*"say" + 0.015*"cup" + 0.013*"hand" + 0.012*"cyclon" + 0.012*"bid" + 0.012*"issu" + 0.012*"order" + 0.011*"discuss"


Topic: 5 Word: 0.020*"polic" + 0.020*"price" + 0.019*"final" + 0.019*"one" + 0.017*"may" + 0.016*"clash" + 0.016*"name" + 0.015*"return" + 0.

## Performance evaluation by classifying sample document using LDA Bag of Words model

In [83]:
processed_docs[4310]

['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']

In [85]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5198260545730591	 
Topic: 0.028*"polic" + 0.025*"say" + 0.022*"continu" + 0.022*"win" + 0.021*"plan" + 0.018*"court" + 0.014*"union" + 0.013*"crash" + 0.012*"air" + 0.012*"offer"

Score: 0.3468012511730194	 
Topic: 0.033*"call" + 0.030*"hospit" + 0.027*"polic" + 0.018*"back" + 0.017*"clash" + 0.016*"concern" + 0.015*"coast" + 0.015*"probe" + 0.014*"man" + 0.014*"qld"

Score: 0.016675421968102455	 
Topic: 0.036*"us" + 0.022*"world" + 0.021*"council" + 0.019*"price" + 0.018*"iraq" + 0.016*"water" + 0.015*"final" + 0.014*"record" + 0.014*"west" + 0.013*"north"

Score: 0.01667441613972187	 
Topic: 0.056*"govt" + 0.026*"council" + 0.024*"may" + 0.023*"drought" + 0.018*"vic" + 0.014*"end" + 0.013*"plan" + 0.012*"fight" + 0.012*"new" + 0.011*"rule"

Score: 0.01667313650250435	 
Topic: 0.045*"iraq" + 0.025*"us" + 0.022*"death" + 0.016*"missil" + 0.014*"hit" + 0.013*"play" + 0.013*"two" + 0.012*"driver" + 0.012*"attack" + 0.011*"trade"

Score: 0.01667311042547226	 
Topic: 0.034*"polic