# Import Libraries

In [1]:
#!pip install gensim
#!pip install pyLDAvis

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import logging
import gensim 
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
#import pyLDAvis.gensim
import pickle 
#import pyLDAvis
from operator import itemgetter
from gensim.test.utils import datapath
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Read Data

In [3]:
news=pd.read_table('MovieStories.utf8.txt',header=0)
news

Unnamed: 0,MovieID,CategoryGross,Storyline
0,1,3,An action-packed drama about a Christian high ...
1,2,4,This holiday comedy is centered around two nei...
2,3,1,The day after they get the word they'll go hom...
3,4,2,"This is the story of Doogal, an adorable candy..."
4,5,5,A drama that focuses on the period in Mary and...
...,...,...,...
1048,1049,5,A futuristic twist on Robert Louis Stevenson's...
1049,1050,9,Ten years after the 'Phantom Menace' threatene...
1050,1051,9,"For Agent J, it is another day at the office, ..."
1051,1052,4,Frida chronicles the life Frida Kahlo shared u...


# Preprocessing

In [4]:
mystopwords=stopwords.words("english") + ['life','film','movie','one','two']
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    tokens = nltk.word_tokenize(text)
    tokens=[ WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens=[ t for t in tokens if t not in mystopwords]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

In [5]:
toks = news['Storyline'].apply(pre_process)
toks

0       [action-packed, drama, christian, high, school...
1       [holiday, comedy, centered, around, neighbor, ...
2       [day, get, word, 'll, home, week, group, soldi...
3       [story, doogal, adorable, candy-loving, mutt, ...
4       [drama, focus, period, mary, joseph, journeyed...
                              ...                        
1048    [futuristic, twist, robert, louis, stevenson, ...
1049    [ten, year, 'phantom, menace, threatened, plan...
1050    [agent, another, day, office, monitoring, lice...
1051    [frida, chronicle, frida, kahlo, shared, unfli...
1052    [story, detective, agustin, rejas, man, clingi...
Name: Storyline, Length: 1053, dtype: object

In [6]:
# Use dictionary (built from corpus) to prepare a Document Term Matrix DTM (using frequency)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = corpora.Dictionary(toks)
print(dictionary)

# Filter off any words with document frequency less than 2, or appearing in more than 80% documents
dictionary.filter_extremes(no_below=2, no_above=0.8)
print(dictionary)

2022-01-07 08:18:04,237 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-01-07 08:18:04,320 : INFO : built Dictionary(12151 unique tokens: ['action-packed', 'battle', 'believe', 'brooke', 'challenge']...) from 1053 documents (total 54912 corpus positions)
2022-01-07 08:18:04,341 : INFO : discarding 6862 tokens: [('cry', 1), ('dismissed', 1), ('infertility', 1), ('organizing', 1), ('shiloh', 1), ('transpires', 1), ('undying', 1), ('decorate', 1), ('visible', 1), ('dislocation', 1)]...
2022-01-07 08:18:04,346 : INFO : keeping 5289 tokens which were in no less than 2 and no more than 842 (=80.0%) documents
2022-01-07 08:18:04,356 : INFO : resulting dictionary: Dictionary(5289 unique tokens: ['action-packed', 'battle', 'believe', 'brooke', 'challenge']...)


Dictionary(12151 unique tokens: ['action-packed', 'battle', 'believe', 'brooke', 'challenge']...)
Dictionary(5289 unique tokens: ['action-packed', 'battle', 'believe', 'brooke', 'challenge']...)


In [7]:
# dtm here is a list of lists, which is exactly a matrix
# stored like this due to sparcity
dtm = [dictionary.doc2bow(d) for d in toks]
dtm

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 1),
  (27, 2),
  (28, 3),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1)],
 [(61, 1),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1)],
 [(30, 2),
  (68, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 

# Train LDA model

In [8]:
lda = gensim.models.ldamodel.LdaModel(dtm, num_topics = 5, id2word = dictionary, passes=10,chunksize=128,random_state=10)
lda.show_topics(10)

2022-01-07 08:18:05,842 : INFO : using symmetric alpha at 0.2
2022-01-07 08:18:05,844 : INFO : using symmetric eta at 0.2
2022-01-07 08:18:05,848 : INFO : using serial LDA version on this node
2022-01-07 08:18:05,856 : INFO : running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 1053 documents, updating model once every 128 documents, evaluating perplexity every 1053 documents, iterating 50x with a convergence threshold of 0.001000
2022-01-07 08:18:05,858 : INFO : PROGRESS: pass 0, at document #128/1053
2022-01-07 08:18:06,018 : INFO : merging changes from 128 documents into a model of 1053 documents
2022-01-07 08:18:06,029 : INFO : topic #0 (0.200): 0.010*"find" + 0.007*"new" + 0.006*"house" + 0.006*"game" + 0.006*"friend" + 0.005*"family" + 0.005*"come" + 0.005*"american" + 0.004*"n't" + 0.004*"home"
2022-01-07 08:18:06,031 : INFO : topic #1 (0.200): 0.008*"friend" + 0.007*"find" + 0.006*"day" + 0.005*"home" + 0.005*"get" + 0.005*"love" + 0.005*"wa

[(0,
  '0.010*"set" + 0.008*"agent" + 0.006*"force" + 0.005*"army" + 0.005*"escape" + 0.005*"must" + 0.004*"named" + 0.004*"battle" + 0.004*"prove" + 0.004*"crew"'),
 (1,
  '0.011*"family" + 0.010*"new" + 0.009*"find" + 0.009*"get" + 0.009*"day" + 0.009*"friend" + 0.008*"n\'t" + 0.008*"come" + 0.007*"home" + 0.007*"love"'),
 (2,
  '0.012*"world" + 0.010*"vampire" + 0.008*"story" + 0.007*"american" + 0.007*"america" + 0.006*"love" + 0.006*"sam" + 0.006*"young" + 0.006*"human" + 0.005*"situation"'),
 (3,
  '0.010*"new" + 0.009*"school" + 0.008*"get" + 0.008*"friend" + 0.007*"going" + 0.007*"want" + 0.006*"high" + 0.005*"drug" + 0.005*"york" + 0.005*"..."'),
 (4,
  '0.007*"man" + 0.006*"find" + 0.006*"daughter" + 0.006*"wife" + 0.006*"team" + 0.006*"father" + 0.005*"year" + 0.005*"child" + 0.005*"death" + 0.005*"family"')]

# Evaluate the coherence score of LDA model

* u_mass:prefer the model close to 0 
* c_v: [0,1], prefer bigger value   
* Note - Do not fully rely on the coherence score


In [9]:
umass = CoherenceModel(lda,  dictionary=dictionary, corpus=dtm, coherence='u_mass')
umass.get_coherence()

2022-01-07 08:18:13,339 : INFO : CorpusAccumulator accumulated stats from 1000 documents


-4.58239006853846

In [10]:
cv = CoherenceModel(lda,  dictionary=dictionary, texts=toks, coherence='c_v')
cv.get_coherence()

2022-01-07 08:18:13,368 : INFO : using WordOccurrenceAccumulator to estimate probabilities from sliding windows
2022-01-07 08:18:13,453 : INFO : WordOccurrenceAccumulator accumulated stats from 1000 documents


0.29174151769917456

# Visualize the topics

In [11]:
# pyLDAvis.enable_notebook()
# LDAvis_prepared = pyLDAvis.gensim.prepare(lda, dtm, dictionary)
# pyLDAvis.show(LDAvis_prepared)

# Study model results

In [12]:
# Get the topic distribution of documents
doc_topics = lda.get_document_topics(dtm)


# show the topic distributions for the first 5 docs, 
for i in range(0, 5):
  print("Document",i)

  print('*'*10)

  print("Topics:",doc_topics[i])

  print("Dominating Topic:",max(doc_topics[i], key=itemgetter(1))[0])

  print('-'*50) 

Document 0
**********
Topics: [(1, 0.25944486), (4, 0.7319102)]
Dominating Topic: 4
--------------------------------------------------
Document 1
**********
Topics: [(0, 0.41030166), (1, 0.20884141), (3, 0.27022108), (4, 0.102815986)]
Dominating Topic: 0
--------------------------------------------------
Document 2
**********
Topics: [(0, 0.11044026), (1, 0.53440255), (3, 0.08784587), (4, 0.26398966)]
Dominating Topic: 1
--------------------------------------------------
Document 3
**********
Topics: [(0, 0.93421537), (2, 0.053893603)]
Dominating Topic: 0
--------------------------------------------------
Document 4
**********
Topics: [(0, 0.8989781), (1, 0.025124812), (2, 0.025722532), (3, 0.025066137), (4, 0.025108416)]
Dominating Topic: 0
--------------------------------------------------


In [14]:
# show the topic distributions
for j in range(5):
  print('Topic:',j)
  print('*'*10)
  c=0
  for i in range(0, news.shape[0]):
    x = max(doc_topics[i], key=itemgetter(1))[0]
    if(x==j):
      print("Document",i,"-",news.iloc[i,-1])
      c=c+1
    if(c==5):
      break
  print('-'*50) 

Topic: 0
**********
Document 1 - This holiday comedy is centered around two neighbors in a small New England town who go to war when one of them decides to decorate his house with a so many Christmas lights that they are visible from space. The neighborhood is turned upside down as the families try to discover the true meaning of Christmas. 
Document 3 - This is the story of Doogal, an adorable candy-loving mutt who goes on a mission to save the world. Doogal must prevent the evil sorcerer Zeebad from freezing the earth forever with the power of the three mysterious legendary diamonds. Joining Doogal on his big quest are pals Dylan, a guitar-playing rabbit, Ermintrude, an opera-singing cow, and Brian, a bashful snail. Hopping on a magic train, they travel over ice-capped mountains, navigate fiery pits of lava, and sail across vast oceans on the perilous journey of a lifetime. Along the way, they learn that the most powerful weapon of all is their friendship - which even Zeebad's magic 

In [15]:
# Select the best topic (with highest score) for each document
top_topic = [max(t, key=itemgetter(1))[0] for t in doc_topics]
print(top_topic)

[4, 0, 1, 0, 0, 1, 4, 3, 3, 0, 0, 3, 1, 4, 1, 4, 2, 0, 4, 0, 3, 1, 0, 1, 4, 2, 1, 1, 0, 3, 2, 3, 1, 2, 4, 4, 1, 3, 3, 1, 3, 0, 1, 2, 0, 1, 4, 3, 1, 3, 4, 0, 3, 1, 3, 2, 3, 0, 0, 3, 3, 3, 4, 3, 1, 4, 2, 3, 0, 3, 3, 0, 2, 1, 3, 4, 1, 4, 4, 4, 1, 4, 1, 4, 4, 1, 4, 1, 4, 1, 4, 2, 2, 1, 4, 4, 1, 3, 1, 4, 4, 1, 4, 4, 2, 0, 1, 3, 3, 2, 3, 2, 3, 4, 2, 4, 4, 0, 1, 3, 1, 4, 2, 3, 1, 3, 4, 3, 1, 0, 1, 0, 3, 1, 2, 1, 3, 2, 0, 1, 3, 1, 1, 2, 4, 2, 1, 2, 3, 1, 4, 0, 4, 3, 4, 3, 2, 3, 1, 0, 4, 4, 2, 1, 3, 1, 3, 1, 1, 1, 3, 1, 2, 0, 4, 2, 3, 4, 0, 3, 0, 0, 1, 0, 1, 3, 2, 4, 1, 4, 3, 2, 1, 1, 4, 4, 4, 3, 1, 2, 3, 4, 3, 0, 2, 1, 0, 0, 2, 0, 3, 4, 1, 1, 3, 1, 4, 1, 4, 4, 3, 1, 4, 0, 4, 4, 2, 3, 4, 3, 2, 0, 1, 4, 2, 2, 4, 3, 0, 0, 4, 1, 0, 3, 4, 1, 2, 2, 3, 4, 1, 3, 0, 0, 4, 3, 1, 4, 3, 1, 1, 2, 3, 3, 1, 3, 2, 4, 4, 3, 3, 4, 4, 1, 4, 1, 2, 2, 3, 4, 3, 1, 0, 3, 3, 0, 3, 1, 0, 3, 4, 4, 4, 4, 1, 4, 3, 3, 1, 2, 3, 1, 3, 2, 1, 1, 1, 1, 2, 3, 4, 4, 4, 3, 1, 0, 1, 3, 1, 4, 4, 3, 4, 1, 0, 3, 3, 3, 3, 4, 1, 3, 1, 

In [16]:
# Label the topics based on representing "topic_words"
dict = {0: 'Love', 1: 'school', 2: 'young_crime', 3: 'politic', 4:'war'}
topics_perDoc = [ dict[t] for t in top_topic ]
print (topics_perDoc)

['war', 'Love', 'school', 'Love', 'Love', 'school', 'war', 'politic', 'politic', 'Love', 'Love', 'politic', 'school', 'war', 'school', 'war', 'young_crime', 'Love', 'war', 'Love', 'politic', 'school', 'Love', 'school', 'war', 'young_crime', 'school', 'school', 'Love', 'politic', 'young_crime', 'politic', 'school', 'young_crime', 'war', 'war', 'school', 'politic', 'politic', 'school', 'politic', 'Love', 'school', 'young_crime', 'Love', 'school', 'war', 'politic', 'school', 'politic', 'war', 'Love', 'politic', 'school', 'politic', 'young_crime', 'politic', 'Love', 'Love', 'politic', 'politic', 'politic', 'war', 'politic', 'school', 'war', 'young_crime', 'politic', 'Love', 'politic', 'politic', 'Love', 'young_crime', 'school', 'politic', 'war', 'school', 'war', 'war', 'war', 'school', 'war', 'school', 'war', 'war', 'school', 'war', 'school', 'war', 'school', 'war', 'young_crime', 'young_crime', 'school', 'war', 'war', 'school', 'politic', 'school', 'war', 'war', 'school', 'war', 'war', 'y

In [17]:
# No. of docs in each topic
labels, counts = np.unique(topics_perDoc, return_counts=True)
print(labels)
print(counts)

['Love' 'politic' 'school' 'war' 'young_crime']
[120 245 300 278 110]


# Save LDA model

In [18]:
# Save model to disk.
temp_file = datapath("LDA_model")
lda.save(temp_file)

2022-01-07 08:21:21,019 : INFO : saving LdaState object under /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.state, separately None
2022-01-07 08:21:21,026 : INFO : saved /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.state
2022-01-07 08:21:21,035 : INFO : saving LdaModel object under /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model, separately ['expElogbeta', 'sstats']
2022-01-07 08:21:21,036 : INFO : storing np array 'expElogbeta' to /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.expElogbeta.npy
2022-01-07 08:21:21,040 : INFO : not storing attribute state
2022-01-07 08:21:21,043 : INFO : not storing attribute dispatcher
2022-01-07 08:21:21,045 : INFO : not storing attribute id2word
2022-01-07 08:21:21,047 : INFO : saved /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model


# Inference

In [19]:
# Load pretrained model from disk.
lda = gensim.models.ldamodel.LdaModel.load(temp_file)

2022-01-07 08:21:24,900 : INFO : loading LdaModel object from /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model
2022-01-07 08:21:24,904 : INFO : loading expElogbeta from /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.expElogbeta.npy with mmap=None
2022-01-07 08:21:24,909 : INFO : setting ignored attribute state to None
2022-01-07 08:21:24,913 : INFO : setting ignored attribute dispatcher to None
2022-01-07 08:21:24,916 : INFO : setting ignored attribute id2word to None
2022-01-07 08:21:24,917 : INFO : loaded /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model
2022-01-07 08:21:24,918 : INFO : loading LdaState object from /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.state
2022-01-07 08:21:24,921 : INFO : loaded /usr/local/lib/python3.7/dist-packages/gensim/test/test_data/LDA_model.state


In [20]:
# Define test documents
test=[
"To save her ailing father from serving in the Imperial Army, a fearless young woman disguises herself as a man to battle northern invaders in China",
"A secret agent embarks on a dangerous, time-bending mission to prevent the start of World War III"
]

In [21]:
# Split tokens
test = [i.split() for i in test]
print(test)

[['To', 'save', 'her', 'ailing', 'father', 'from', 'serving', 'in', 'the', 'Imperial', 'Army,', 'a', 'fearless', 'young', 'woman', 'disguises', 'herself', 'as', 'a', 'man', 'to', 'battle', 'northern', 'invaders', 'in', 'China'], ['A', 'secret', 'agent', 'embarks', 'on', 'a', 'dangerous,', 'time-bending', 'mission', 'to', 'prevent', 'the', 'start', 'of', 'World', 'War', 'III']]


In [22]:
# new_dtm here is a list of lists, which is exactly a matrix
new_dtm = [dictionary.doc2bow(d) for d in test]
new_dtm

[[(1, 1),
  (21, 1),
  (115, 1),
  (133, 1),
  (173, 1),
  (234, 1),
  (2805, 1),
  (2849, 1)],
 [(158, 1), (169, 1), (272, 1), (594, 1), (923, 1), (2673, 1)]]

In [23]:
for i in range(0, 2):
  print("Document",i)

  print('*'*10)

  unseen_doc = new_dtm[i]

  print("Topics:",lda[unseen_doc])

  x = max(lda[unseen_doc], key=itemgetter(1))[0]
  
  print("Dominating Topic:",x,dict[x])

  print('-'*50) 

Document 0
**********
Topics: [(0, 0.022982622), (1, 0.02321663), (2, 0.20845246), (3, 0.022502268), (4, 0.72284603)]
Dominating Topic: 4 war
--------------------------------------------------
Document 1
**********
Topics: [(0, 0.47309446), (1, 0.440222), (2, 0.028582554), (3, 0.028722133), (4, 0.029378882)]
Dominating Topic: 0 Love
--------------------------------------------------


# Update the model by incrementally training on the new corpus


In [24]:
# lda.update(new_dtm)

# for i in range(0, 2):
#     unseen_doc = new_dtm[i]
#     print(lda[unseen_doc])