In [96]:
#Install plLDAvis to visualize topic modelling
!pip install pyLDAvis



In [97]:
!pip install --upgrade pandas

Requirement already up-to-date: pandas in /usr/local/lib/python3.7/dist-packages (1.2.4)


In [98]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim packages
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools for LDA
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [99]:
#NLTK for NLP
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
#Read csv file into dataframe
df= pd.read_csv('/content/a.csv')

In [101]:
print(df.head())

                                                news      type
0  b'US consumer confidence up\n\nConsumers\' con...  business
1  b'The \'ticking budget\' facing the US\n\nThe ...  business
2  b"Mitsubishi in Peugeot link talks\n\nTrouble-...  business
3  b'BMW reveals new models pipeline\n\nBMW is pr...  business
4  b'World leaders gather to face uncertainty\n\n...  business


In [102]:
# Count of each category of articles
df.type.astype('category').value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: type, dtype: int64

In [103]:
#Convert into list
data= df.news.values.tolist()

In [104]:
data[:1]

['b\'US consumer confidence up\\n\\nConsumers\\\' confidence in the state of the US economy is at its highest for five months and they are optimistic about 2005, an influential survey says.\\n\\nThe feel-good factor among US consumers rose in December for the first time since July according to new data. The Conference Board survey of 5,000 households pointed to renewed optimism about job creation and economic growth. US retailers have reported strong sales over the past 10 days after a slow start to the crucial festive season.\\n\\nAccording to figures also released on Tuesday, sales in shopping malls in the week to 25 December were 4.3% higher than in 2003 following a last minute rush. Wal-Mart, the largest US retailer, has said its December sales are expected to be better than previously forecast because of strong post-Christmas sales.\\n\\nIt is expecting annual sales growth of between 1% and 3% for the month. Consumer confidence figures are considered a key economic indicator becau

In [105]:
#Remove new line characters by replacing them with space
data = [re.sub("\s+", ' ', sent) for sent in data]

# Remove single quotes and replace with space
data= [re.sub("\'", ' ', sent) for sent in data]

print(data[:1])

['b US consumer confidence up\\n\\nConsumers\\  confidence in the state of the US economy is at its highest for five months and they are optimistic about 2005, an influential survey says.\\n\\nThe feel-good factor among US consumers rose in December for the first time since July according to new data. The Conference Board survey of 5,000 households pointed to renewed optimism about job creation and economic growth. US retailers have reported strong sales over the past 10 days after a slow start to the crucial festive season.\\n\\nAccording to figures also released on Tuesday, sales in shopping malls in the week to 25 December were 4.3% higher than in 2003 following a last minute rush. Wal-Mart, the largest US retailer, has said its December sales are expected to be better than previously forecast because of strong post-Christmas sales.\\n\\nIt is expecting annual sales growth of between 1% and 3% for the month. Consumer confidence figures are considered a key economic indicator because

In [106]:
#Tokenising using Gensim
def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words= list(sent_to_words(data))
print(data_words[:1])

[['us', 'consumer', 'confidence', 'up', 'nconsumers', 'confidence', 'in', 'the', 'state', 'of', 'the', 'us', 'economy', 'is', 'at', 'its', 'highest', 'for', 'five', 'months', 'and', 'they', 'are', 'optimistic', 'about', 'an', 'influential', 'survey', 'says', 'nthe', 'feel', 'good', 'factor', 'among', 'us', 'consumers', 'rose', 'in', 'december', 'for', 'the', 'first', 'time', 'since', 'july', 'according', 'to', 'new', 'data', 'the', 'conference', 'board', 'survey', 'of', 'households', 'pointed', 'to', 'renewed', 'optimism', 'about', 'job', 'creation', 'and', 'economic', 'growth', 'us', 'retailers', 'have', 'reported', 'strong', 'sales', 'over', 'the', 'past', 'days', 'after', 'slow', 'start', 'to', 'the', 'crucial', 'festive', 'season', 'naccording', 'to', 'figures', 'also', 'released', 'on', 'tuesday', 'sales', 'in', 'shopping', 'malls', 'in', 'the', 'week', 'to', 'december', 'were', 'higher', 'than', 'in', 'following', 'last', 'minute', 'rush', 'wal', 'mart', 'the', 'largest', 'us', '

In [107]:
#Bigram and Trigram models
#bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
#bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)
#print(trigram_mod[bigram_mod[data_words[0]]])

In [108]:
#Remove stop words
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [109]:
#Bigram model
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [110]:
#Trigram model
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [111]:
#Lemmatization of words
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [112]:
#Function call for stop words
data_words_nostops = remove_stopwords(data_words)

In [113]:
#Make bigrams
#data_words_bigrams = make_bigrams(data_words_nostops)

In [114]:
#Load SpaCy Library
nlp = spacy.load('en', disable=['parser', 'ner'])

In [115]:
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [116]:
print(data_lemmatized[:1])

[['consumer', 'confidence', 'nconsumer', 'confidence', 'state', 'economy', 'high', 'month', 'optimistic', 'influential', 'survey', 'say', 'nthe', 'feel', 'good', 'factor', 'consumer', 'rise', 'first', 'time', 'accord', 'new', 'survey', 'household', 'point', 'renew', 'optimism', 'job', 'creation', 'economic', 'growth', 'retailer', 'report', 'strong', 'sale', 'past', 'day', 'slow', 'start', 'crucial', 'festive', 'season', 'naccording', 'figure', 'also', 'release', 'sale', 'shopping', 'mall', 'week', 'higher', 'follow', 'last', 'minute', 'rush', 'large', 'retailer', 'say', 'sale', 'expect', 'better', 'previously', 'forecast', 'strong', 'post', 'sale', 'nit', 'expect', 'annual', 'sale', 'growth', 'month', 'consumer', 'confidence', 'figure', 'consider', 'key', 'economic', 'indicator', 'consumer', 'spending', 'account', 'third', 'economic', 'activity', 'continue', 'economic', 'expansion', 'combine', 'job', 'growth', 'consumer', 'end', 'year', 'high', 'note', 'say', 'suggest', 'economy', 'con

In [117]:
#Create Dictionary for input into LDA
id2word= corpora.Dictionary(data_lemmatized)
#Create Corpus
texts= data_lemmatized
#TDF
corpus= [id2word.doc2bow(text) for text in texts]

In [118]:
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 3), (8, 1), (9, 5), (10, 2), (11, 1), (12, 1), (13, 1), (14, 4), (15, 3), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 2), (26, 1), (27, 1), (28, 1), (29, 3), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 3), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 5), (70, 3), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 2), (81, 1), (82, 2), (83, 1), (84, 2)]]


In [119]:
#Printing in human readable form for the first document
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('accord', 1), ('account', 1), ('activity', 1), ('also', 1), ('annual', 1), ('better', 1), ('combine', 1), ('confidence', 3), ('consider', 1), ('consumer', 5), ('continue', 2), ('creation', 1), ('crucial', 1), ('day', 1), ('economic', 4), ('economy', 3), ('end', 1), ('expand', 1), ('expansion', 1), ('expect', 2), ('factor', 1), ('federal', 1), ('feel', 1), ('festive', 1), ('figure', 2), ('first', 2), ('follow', 1), ('forecast', 1), ('good', 1), ('growth', 3), ('high', 2), ('higher', 1), ('household', 1), ('increase', 1), ('indicator', 1), ('influential', 1), ('interest', 1), ('job', 2), ('key', 1), ('large', 1), ('last', 1), ('mall', 1), ('minute', 1), ('month', 3), ('naccording', 1), ('nconsumer', 1), ('new', 1), ('next', 1), ('nit', 1), ('note', 1), ('nthe', 1), ('optimism', 1), ('optimistic', 1), ('overall', 1), ('past', 1), ('perform', 1), ('point', 1), ('post', 1), ('previously', 1), ('prompt', 1), ('rate', 1), ('recent', 1), ('release', 1), ('renew', 1), ('report', 1), ('reserv

In [120]:
#Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [121]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.016*"say" + 0.016*"technology" + 0.015*"mobile" + 0.013*"phone" + 0.012*"user" + 0.011*"service" + 0.010*"firm" + 0.010*"computer" + 0.010*"music" + 0.009*"network"'), (1, '0.040*"game" + 0.020*"player" + 0.018*"play" + 0.015*"year" + 0.011*"good" + 0.009*"first" + 0.009*"time" + 0.009*"take" + 0.009*"go" + 0.009*"win"'), (2, '0.023*"say" + 0.021*"people" + 0.013*"make" + 0.010*"get" + 0.010*"work" + 0.010*"could" + 0.009*"go" + 0.008*"way" + 0.008*"would" + 0.007*"use"'), (3, '0.016*"system" + 0.011*"security" + 0.011*"test" + 0.010*"domain" + 0.010*"machine" + 0.009*"could" + 0.009*"site" + 0.008*"chip" + 0.008*"card" + 0.007*"computer"'), (4, '0.045*"say" + 0.016*"would" + 0.010*"mail" + 0.009*"search" + 0.007*"could" + 0.006*"tell" + 0.006*"make" + 0.006*"campaign" + 0.006*"also" + 0.006*"government"')]


In [122]:
#Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -7.8419021401859315


In [123]:
#Coherence score computation
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.390331156233603



> Coherence score with Gensim LDA is 0.39




In [124]:
#Install dependencies for MALLET
import os
def install_java_jdk():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
  os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64"
  !java -version

install_java_jdk()

openjdk version "11.0.10" 2021-01-19
OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)


In [125]:
#Download MALLET package
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

--2021-04-24 19:53:57--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
mallet-2.0.8.zip: Operation not supported

Cannot write to ‘mallet-2.0.8.zip’ (Operation not supported).
unzip:  cannot find or open mallet-2.0.8.zip, mallet-2.0.8.zip.zip or mallet-2.0.8.zip.ZIP.


In [126]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[Errno 2] No such file or directory: 'gdrive'
/content/gdrive


In [127]:
#Set up MALLET environment
os.environ['MALLET_HOME']='/content/mallet-2.0.8'
mallet_path='/content/mallet-2.0.8/bin/mallet'

In [134]:
#LDA mallet function
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)

In [135]:
#Print the topics
pprint(ldamallet.show_topics(formatted=False))

[(0,
  [('year', 0.02232067652707119),
   ('company', 0.012044742427815775),
   ('market', 0.011690980762103703),
   ('sale', 0.011573060206866345),
   ('month', 0.011320373302786294),
   ('rise', 0.0111856069539436),
   ('firm', 0.009517873387015261),
   ('expect', 0.00896196219803915),
   ('increase', 0.008776658468380445),
   ('price', 0.008692429500353762)]),
 (1,
  [('government', 0.016062968057465994),
   ('plan', 0.012165673238575578),
   ('election', 0.010927709001986856),
   ('people', 0.009674461256304447),
   ('issue', 0.008253094910591471),
   ('claim', 0.007993275255998777),
   ('public', 0.007932141219624026),
   ('party', 0.007183249274033318),
   ('labour', 0.006938713128534312),
   ('make', 0.0064496408375362985)]),
 (2,
  [('people', 0.02046364967261378),
   ('make', 0.010312263710806159),
   ('service', 0.01000659598770894),
   ('technology', 0.009636577165012308),
   ('phone', 0.009491787190913625),
   ('mobile', 0.009009153943918017),
   ('user', 0.0076095175276307

In [136]:
#Calculate coherence score with Mallet
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.5010935986602464


Coherence score with MALLET Gensim is higher 0.501