# Topic Modelling with Latend Dirichlet Allocation

### Step 1. Load the data

In [1]:
# Loading the dataset from a csv
import pandas as pd
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[:300000][['headline_text']];
data_text['index'] = data_text.index

documents = data_text

# Total number of documents
print(f'The total number of documents is: {len(documents)}')



  data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);


The total number of documents is: 300000


In [2]:
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


In [3]:
!git push

Everything up-to-date


### Step 2. Imports and data preprocessing

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
nltk.download('wordnet')
np.random.seed(400)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Lemmatizer example
print(WordNetLemmatizer().lemmatize('went', pos = 'v'))

go


In [6]:
# Stemmer example
stemmer = SnowballStemmer("english")
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]

pd.DataFrame(data={'original word':original_words, 'stemmed':singles })

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [7]:
# Stemming and Lematization on the entire dataset

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize

def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
# Document example after preprocessing

document_num = 0
doc_sample = documents[documents['index'] == 0].values[0][0]

print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['aba', 'decides', 'against', 'community', 'broadcasting', 'licence']


Tokenized and lemmatized document: 
['decid', 'communiti', 'broadcast', 'licenc']


In [9]:
# Preprocess headlines
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:3]

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
Name: headline_text, dtype: object

### Step 3. Bag of Words approach on the dataset

In [10]:
# Create a dictionary of word counts
dictionary = gensim.corpora.Dictionary(processed_docs)

In [11]:
# Remove too rare and too common words from the dictionary
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [12]:
# Initialize the bag of words corpus using the word counts
bow_corpus = [dictionary.doc2bow(document) for document in processed_docs]
bow_corpus[document_num]

[(0, 1), (1, 1), (2, 1), (3, 1)]

In [13]:
# Bag of words document example
bow_doc_4310 = bow_corpus[document_num]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 0 ("broadcast") appears 1 time.
Word 1 ("communiti") appears 1 time.
Word 2 ("decid") appears 1 time.
Word 3 ("licenc") appears 1 time.


### Step 4. TF-IDF approach on the dataset

In [14]:
from gensim import corpora, models
# tfidf model and corpus
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [15]:
# tfidf scores
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5959813347777092),
 (1, 0.39204529549491984),
 (2, 0.48531419274988147),
 (3, 0.5055461098578569)]


### Step 5. LDA using Bag of Words

In [16]:
# Initialize the LDA model using Multicore approach from the gensim library.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes = 50)

In [17]:
# For each topic explore the words occurence for that topic and their relative weights
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic, idx ))
    print("\n")

Topic: 0.024*"call" + 0.021*"probe" + 0.019*"talk" + 0.017*"say" + 0.017*"hold" + 0.016*"work" + 0.014*"chief" + 0.014*"leader" + 0.012*"inquiri" + 0.011*"launch" 
Words: 0


Topic: 0.027*"claim" + 0.020*"govt" + 0.018*"minist" + 0.016*"nation" + 0.016*"worker" + 0.015*"labor" + 0.014*"protest" + 0.014*"union" + 0.014*"defend" + 0.014*"school" 
Words: 1


Topic: 0.038*"report" + 0.029*"hospit" + 0.019*"coast" + 0.016*"open" + 0.015*"gold" + 0.012*"deal" + 0.012*"guilti" + 0.011*"sign" + 0.011*"find" + 0.010*"bushfir" 
Words: 2


Topic: 0.028*"fund" + 0.026*"govt" + 0.022*"water" + 0.020*"boost" + 0.016*"servic" + 0.014*"urg" + 0.014*"health" + 0.013*"farmer" + 0.012*"price" + 0.011*"drought" 
Words: 3


Topic: 0.016*"australia" + 0.016*"lead" + 0.015*"world" + 0.014*"win" + 0.012*"final" + 0.011*"test" + 0.010*"aussi" + 0.010*"england" + 0.009*"play" + 0.008*"clash" 
Words: 4


Topic: 0.064*"polic" + 0.035*"charg" + 0.031*"court" + 0.027*"face" + 0.019*"jail" + 0.018*"drug" + 0.017*"mu

### Step 6. LDA using TF-IDF approach

In [18]:
# Initialize the LDA model using Multicore approach from the gensim library.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 10, id2word = dictionary, passes = 50)

In [19]:
# For each topic explore the words occurence for that topic and their relative weights
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.008*"award" + 0.008*"nuclear" + 0.008*"korea" + 0.007*"talk" + 0.007*"deal" + 0.007*"downer" + 0.006*"iran" + 0.006*"murray" + 0.006*"china" + 0.006*"trade"


Topic: 1 Word: 0.013*"price" + 0.012*"rise" + 0.011*"market" + 0.009*"rain" + 0.007*"rate" + 0.007*"farmer" + 0.007*"high" + 0.006*"water" + 0.006*"profit" + 0.006*"drought"


Topic: 2 Word: 0.012*"toll" + 0.010*"bird" + 0.008*"fish" + 0.007*"illeg" + 0.006*"rebel" + 0.006*"death" + 0.005*"road" + 0.005*"crackdown" + 0.005*"news" + 0.005*"human"


Topic: 3 Word: 0.025*"crash" + 0.015*"polic" + 0.012*"investig" + 0.011*"die" + 0.010*"accid" + 0.010*"victim" + 0.009*"fatal" + 0.008*"blaze" + 0.007*"death" + 0.007*"plane"


Topic: 4 Word: 0.024*"charg" + 0.021*"court" + 0.020*"polic" + 0.016*"murder" + 0.014*"jail" + 0.014*"drug" + 0.012*"face" + 0.010*"assault" + 0.009*"accus" + 0.009*"search"


Topic: 5 Word: 0.011*"final" + 0.009*"gold" + 0.008*"open" + 0.008*"drink" + 0.007*"coast" + 0.007*"world" + 0.005*"game"

### Step 7. Performance evaluation on Bag of Words

In [20]:
# Example document
processed_docs[0]

['decid', 'communiti', 'broadcast', 'licenc']

In [21]:
document_num = 0
# Doment test with the LDA model
for index, score in sorted(lda_model[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8199866414070129	 
Topic: 0.062*"plan" + 0.049*"council" + 0.019*"consid" + 0.018*"concern" + 0.015*"govt" + 0.014*"group" + 0.012*"resid" + 0.012*"mayor" + 0.011*"develop" + 0.010*"park"

Score: 0.020001880824565887	 
Topic: 0.028*"fund" + 0.026*"govt" + 0.022*"water" + 0.020*"boost" + 0.016*"servic" + 0.014*"urg" + 0.014*"health" + 0.013*"farmer" + 0.012*"price" + 0.011*"drought"

Score: 0.020001878961920738	 
Topic: 0.016*"australia" + 0.016*"lead" + 0.015*"world" + 0.014*"win" + 0.012*"final" + 0.011*"test" + 0.010*"aussi" + 0.010*"england" + 0.009*"play" + 0.008*"clash"

Score: 0.02000136487185955	 
Topic: 0.024*"call" + 0.021*"probe" + 0.019*"talk" + 0.017*"say" + 0.017*"hold" + 0.016*"work" + 0.014*"chief" + 0.014*"leader" + 0.012*"inquiri" + 0.011*"launch"

Score: 0.02000136487185955	 
Topic: 0.027*"claim" + 0.020*"govt" + 0.018*"minist" + 0.016*"nation" + 0.016*"worker" + 0.015*"labor" + 0.014*"protest" + 0.014*"union" + 0.014*"defend" + 0.014*"school"

Score: 0.0200

### Step 8. Performance evaluation on TF-IDF

In [22]:
# Doment test with the LDA model
for index, score in sorted(lda_model_tfidf[bow_corpus[document_num]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.4290580153465271	 
Topic: 0.012*"toll" + 0.010*"bird" + 0.008*"fish" + 0.007*"illeg" + 0.006*"rebel" + 0.006*"death" + 0.005*"road" + 0.005*"crackdown" + 0.005*"news" + 0.005*"human"

Score: 0.4108455777168274	 
Topic: 0.017*"council" + 0.015*"plan" + 0.011*"govt" + 0.011*"water" + 0.007*"urg" + 0.007*"mayor" + 0.007*"develop" + 0.006*"fund" + 0.006*"group" + 0.006*"communiti"

Score: 0.020026210695505142	 
Topic: 0.020*"iraq" + 0.018*"kill" + 0.011*"troop" + 0.010*"bomb" + 0.010*"iraqi" + 0.008*"attack" + 0.008*"soldier" + 0.007*"blast" + 0.007*"baghdad" + 0.007*"terror"

Score: 0.020011257380247116	 
Topic: 0.012*"govt" + 0.011*"health" + 0.010*"fund" + 0.008*"union" + 0.007*"labor" + 0.007*"urg" + 0.006*"servic" + 0.006*"plan" + 0.006*"indigen" + 0.006*"opposit"

Score: 0.020010722801089287	 
Topic: 0.019*"closer" + 0.009*"england" + 0.008*"tiger" + 0.007*"aussi" + 0.006*"victori" + 0.006*"lead" + 0.006*"australia" + 0.006*"test" + 0.006*"black" + 0.005*"blue"

Score: 0.02

### Step 9. Testing model on an unseen document

In [23]:
unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.22005243599414825	 Topic: 0.016*"year" + 0.016*"market" + 0.016*"help" + 0.015*"record" + 0.014*"law"
Score: 0.22003579139709473	 Topic: 0.037*"kill" + 0.028*"crash" + 0.018*"death" + 0.017*"road" + 0.016*"polic"
Score: 0.22003096342086792	 Topic: 0.062*"plan" + 0.049*"council" + 0.019*"consid" + 0.018*"concern" + 0.015*"govt"
Score: 0.21982374787330627	 Topic: 0.028*"fund" + 0.026*"govt" + 0.022*"water" + 0.020*"boost" + 0.016*"servic"
Score: 0.020011626183986664	 Topic: 0.029*"miss" + 0.029*"continu" + 0.022*"forc" + 0.018*"search" + 0.017*"iraq"
Score: 0.020011186599731445	 Topic: 0.024*"call" + 0.021*"probe" + 0.019*"talk" + 0.017*"say" + 0.017*"hold"
Score: 0.020008552819490433	 Topic: 0.027*"claim" + 0.020*"govt" + 0.018*"minist" + 0.016*"nation" + 0.016*"worker"
Score: 0.020008552819490433	 Topic: 0.038*"report" + 0.029*"hospit" + 0.019*"coast" + 0.016*"open" + 0.015*"gold"
Score: 0.020008552819490433	 Topic: 0.016*"australia" + 0.016*"lead" + 0.015*"world" + 0.014*"win

In [None]:
!git add .
!git commit -m ""