In [105]:
import numpy as np
import pandas as pd
import pickle
import os

import gensim
from gensim.matutils import hellinger
from gensim.models import ldaseqmodel, ldamodel, wrappers
from gensim.test.utils import datapath
from gensim.models.coherencemodel import CoherenceModel

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [101]:
# reduce a single stop word to its Porter stem
def semmatize_stop_words(w):
    """Return the Porter stem of the single word ``w``."""
    return PorterStemmer().stem(w)

# stop-word list = NLTK English list + project extras + the stems of both
def get_stop_words():
    """Build the stop-word list used when filtering stemmed documents.

    Starts from NLTK's English stop words, appends a handful of
    project-specific terms, then appends the Porter stem of every entry.
    Duplicates are dropped while keeping first-occurrence order.

    Returns:
        list: the de-duplicated stop words (raw forms first, then stems).
    """
    extra_terms = ['from', 'subject', 're' , 'use', 'rt','thing','way','year','people','time']
    base_words = stopwords.words('english') + extra_terms
    stemmed_words = map(semmatize_stop_words, base_words)

    # dict keys keep insertion order, which gives an order-preserving de-dup
    return list(dict.fromkeys([*base_words, *stemmed_words]))

# stem every sufficiently long token of one document
def semmatize_text(text):
    """Return the Porter stems of the tokens in ``text`` longer than 3 characters.

    Args:
        text: iterable of token strings for one document.

    Returns:
        list: stems of the retained tokens, in their original order.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in text:
        # tokens of length <= 3 are dropped before stemming
        if len(token) > 3:
            stems.append(stemmer.stem(token))
    return stems

def tokenize_text(doc):
    """Tokenize the ``'contents'`` column of ``doc`` with ``nltk.word_tokenize``.

    The column is processed directly instead of applying a lambda row by row
    (``axis=1``). That avoids building a Series object for every row, and it
    returns an (empty) Series for an empty frame, where the row-wise form
    hands back a DataFrame.

    Args:
        doc: DataFrame that has a ``'contents'`` column of text.

    Returns:
        pandas.Series: one list of tokens per row, indexed like ``doc``.
    """
    tokens = doc['contents'].apply(nltk.word_tokenize)
    # the row-wise apply produced an unnamed Series; keep that unchanged
    tokens.name = None
    return tokens

def remove_stopwords(texts, stop_words):
    """Return the words of ``texts`` that are not in ``stop_words``.

    The stop words are materialised into a set once, so each membership test
    is O(1) rather than a linear scan, and a one-shot iterable (generator,
    iterator) is not silently exhausted after the first lookup.

    Args:
        texts: iterable of hashable tokens (one document).
        stop_words: iterable of tokens to drop.

    Returns:
        list: the retained tokens, in their original order.
    """
    if isinstance(stop_words, (set, frozenset)):
        # already a hash-based container: reuse it, no copy needed
        lookup = stop_words
    else:
        lookup = set(stop_words)
    return [word for word in texts if word not in lookup]

def process_lda_format(doc):
    """Turn raw documents into stemmed, stop-word-free token lists.

    Pipeline: tokenize each row, stem the tokens, then drop every stem that
    appears in the stop-word list from ``get_stop_words()``.

    Args:
        doc: DataFrame accepted by ``tokenize_text``.

    Returns:
        One list of remaining stems per input row.
    """
    stop_words = get_stop_words()
    stems_per_doc = tokenize_text(doc).apply(semmatize_text)
    return stems_per_doc.map(lambda stems: remove_stopwords(stems, stop_words))

def initialize_corpus_and_dictionary(stemmed_dataset):
    """Build the gensim dictionary and bag-of-words corpus for the documents.

    Args:
        stemmed_dataset: iterable of token lists, one per document.

    Returns:
        tuple: ``(word_corpus, dictionary_of_words)`` where ``word_corpus`` is
        the list of ``doc2bow`` results, one per document.
    """
    vocabulary = gensim.corpora.Dictionary(stemmed_dataset)
    bow_corpus = list(map(vocabulary.doc2bow, stemmed_dataset))
    return bow_corpus, vocabulary

def lda_datasets(doc):
    """Prepare everything the topic models need from the raw documents.

    Args:
        doc: DataFrame accepted by ``process_lda_format``.

    Returns:
        tuple: ``(stemmed_dataset, corpus, dictionary)``.
    """
    stemmed_dataset = process_lda_format(doc)
    return (stemmed_dataset, *initialize_corpus_and_dictionary(stemmed_dataset))

# train a dynamic topic model (sequential LDA) and save it under LDA_models/
def run_seq_lda(time_slice, corpus, dictionary, num_topics, file_ender, initialize, sstats, passes=20):
    """Fit an ``LdaSeqModel`` and save it to ``<cwd>/LDA_models/``.

    The file name is ``lda_model_<num_topics>_dtm<file_ender>``; the suffix is
    appended with no separator.

    Args:
        time_slice: number of documents in each consecutive time slice.
        corpus: bag-of-words corpus, ordered consistently with ``time_slice``.
        dictionary: gensim dictionary passed as ``id2word``.
        num_topics: number of topics to fit.
        file_ender: suffix appended to the saved file name.
        initialize: value forwarded to ``LdaSeqModel(initialize=...)``.
        sstats: value forwarded to ``LdaSeqModel(sstats=...)``.
        passes: value forwarded to ``LdaSeqModel(passes=...)``; defaults to
            the previously hard-coded 20.
    """
    # Create the output directory before training, so a long fit is not lost
    # to a missing folder at save time.
    model_dir = os.path.join(os.getcwd(), "LDA_models")
    os.makedirs(model_dir, exist_ok=True)
    # Plain path join: gensim.test.utils.datapath is a test-fixture helper and
    # simply returned this absolute path unchanged.
    temp_file = os.path.join(model_dir, "lda_model_" + str(num_topics) + "_dtm" + file_ender)

    ldaseq = ldaseqmodel.LdaSeqModel(
        corpus=corpus,
        id2word=dictionary,
        time_slice=time_slice,
        num_topics=num_topics,
        passes=passes,
        initialize=initialize,
        sstats=sstats,
    )
    print('Model is saving... at', temp_file)
    ldaseq.save(temp_file)

# test coherence of each time step
def coherence_time_cal(model, num_time_slices=None, bow_corpus=None, id2word=None, texts=None):
    """Compute c_v and u_mass coherence of ``model`` for every time slice.

    For each slice ``i`` the topics come from ``model.dtm_coherence(time=i)``
    and are scored twice with ``CoherenceModel``: ``'u_mass'`` on the
    bag-of-words corpus and ``'c_v'`` on the tokenised texts.

    Args:
        model: a fitted ``LdaSeqModel``.
        num_time_slices: number of slices to score; defaults to
            ``model.num_time_slices`` instead of a hard-coded 7.
        bow_corpus: bag-of-words corpus; defaults to the notebook-level
            ``corpus``.
        id2word: gensim dictionary; defaults to the notebook-level
            ``dictionary``.
        texts: tokenised documents; defaults to the notebook-level
            ``stemmed_dataset``.

    Returns:
        tuple: ``(cv_dtm, umass_dtm)``, two lists with one score per slice.
    """
    if num_time_slices is None:
        num_time_slices = model.num_time_slices
    # Fall back to the notebook globals so existing one-argument calls still work.
    if bow_corpus is None:
        bow_corpus = corpus
    if id2word is None:
        id2word = dictionary
    if texts is None:
        texts = stemmed_dataset

    cv_dtm = []
    umass_dtm = []
    for i in range(num_time_slices):
        topics_dtm = model.dtm_coherence(time=i)
        umass = CoherenceModel(topics=topics_dtm, corpus=bow_corpus, dictionary=id2word, coherence='u_mass').get_coherence()
        cv = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()
        cv_dtm.append(cv)
        umass_dtm.append(umass)
    return cv_dtm, umass_dtm

In [100]:
# load the corpus used for training the LDA model
data_nouns = pd.read_csv("Topic_modelling_data/doc_only_nouns.csv")
# not used by the cells shown here (lda_datasets builds its own list); kept in
# case other cells rely on the name
stop_words = get_stop_words()
stemmed_dataset, corpus, dictionary = lda_datasets(data_nouns)
docs = pd.read_csv("Topic_modelling_data/cleaned_documents.csv", lineterminator='\n', index_col = 0)

# parse the date strings and keep them at month resolution
docs['publish_date'] = pd.to_datetime(docs['publish_date']).dt.to_period('M')

# count number of docs each year (docs & corpus have been sorted by publish date)
time_slice_cnt = docs['publish_date'].map(lambda x: x.year).value_counts().sort_index()
time_slice = time_slice_cnt.values.tolist()

# the slices are passed to LdaSeqModel alongside `corpus`, so they must cover
# it exactly; a mismatch means the two CSVs are out of sync or a date is missing
assert sum(time_slice) == len(corpus), (
    f"time slices cover {sum(time_slice)} docs but the corpus has {len(corpus)}"
)

In [78]:
# dtm with 10 topics with 'gensim' initialization
# Load the previously saved model from LDA_models/ rather than retraining it.
dtm_model = ldaseqmodel.LdaSeqModel.load("LDA_models/lda_model_10_dtm")
# Last expression of the cell: renders topic 0's top terms, one list per time slice.
dtm_model.print_topic_times(topic=0)

[[('robot', 0.012071621214888546),
  ('energi', 0.008733971026419811),
  ('world', 0.008671711716155562),
  ('space', 0.007844593364427768),
  ('car', 0.007515684188614516),
  ('engin', 0.007194062443219673),
  ('system', 0.007022144927183778),
  ('technolog', 0.006499031715150603),
  ('mile', 0.006496891188153285),
  ('design', 0.006346533271317728),
  ('anim', 0.00556179557261388),
  ('someth', 0.005363069543141257),
  ('surfac', 0.00518852490868416),
  ('play', 0.005045959578334096),
  ('kind', 0.004929835352650873),
  ('climat', 0.004902731481389513),
  ('feet', 0.004788086920614842),
  ('problem', 0.004672660024909499),
  ('earth', 0.004544334140334567),
  ('fuel', 0.004532879361849345)],
 [('robot', 0.012641636282720005),
  ('energi', 0.008920483224274606),
  ('world', 0.008124162378851997),
  ('car', 0.007531019043622682),
  ('engin', 0.007422380843935664),
  ('space', 0.007371100669863122),
  ('system', 0.007061647861525047),
  ('mile', 0.006632512987485035),
  ('technolog', 0.

### DTM with the global LDA model's output as initialization

In [116]:
# use the global lda model with 10 topics as the initial stats
lda_10_global = wrappers.LdaMallet.load("LDA_models/lda_model_10_mallet")
# transpose the topic-term matrix before handing it over as `sstats`
sstats = lda_10_global.get_topics().T
# context manager so the pickle file is flushed and closed deterministically
with open("Topic_modelling_data/sstats.pkl", "wb") as sstats_file:
    pickle.dump(sstats, sstats_file)
# 'own' initialization: seed the DTM with the sstats computed above
run_seq_lda(time_slice, corpus, dictionary, 10, "glocalinit", 'own', sstats)

  convergence = np.fabs((bound - old_bound) / old_bound)


Model is saving... at /Users/qyq/Desktop/Courses/2022-SS/IML_Vis/Project/Visual_Analytics/LDA_models/lda_model_10_dtmglocalinit


In [119]:
# test coherence of each time step
dtm_global = ldaseqmodel.LdaSeqModel.load("LDA_models/lda_model_10_dtmglocalinit")
cv_dtm, umass_dtm = coherence_time_cal(dtm_global)
# A cell renders only its last expression, so the bare `cv_dtm` that used to
# sit here never displayed anything; it is shown in its own cell instead.
umass_dtm

[-1.0591667768745654,
 -1.0683460546663008,
 -1.0261321501470198,
 -1.0712732080902374,
 -1.1116973648912232,
 -1.146670538670545,
 -1.1186473440672997]

In [120]:
# c_v coherence per time slice, as returned by coherence_time_cal(dtm_global)
cv_dtm

[0.4009920857703658,
 0.41184239033589626,
 0.4058607268097201,
 0.41026362662016014,
 0.4147431005175964,
 0.4213206453141523,
 0.42009862107659224]

In [126]:
# top terms of topic 0 in each time slice of the globally-initialized DTM
dtm_global.print_topic_times(topic = 0)

[[('stori', 0.02314356373052659),
  ('someth', 0.013881859121519645),
  ('film', 0.01364384910914877),
  ('pictur', 0.013154714257060785),
  ('imag', 0.012542714758841642),
  ('photograph', 0.010592430691370917),
  ('movi', 0.00891006293783841),
  ('world', 0.00801930950288065),
  ('hand', 0.007763448252868459),
  ('sort', 0.007726262148962326),
  ('book', 0.007698475042922546),
  ('face', 0.006245336022960306),
  ('mind', 0.006214939745809623),
  ('audienc', 0.005585245406068522),
  ('moment', 0.0055531206243672904),
  ('show', 0.00486030564498999),
  ('part', 0.004763124893877237),
  ('dont', 0.004718778704488555),
  ('eye', 0.004675271312239671),
  ('camera', 0.004580922527671174)],
 [('stori', 0.023621778212720077),
  ('someth', 0.01358867179612797),
  ('imag', 0.012709330360287298),
  ('pictur', 0.012221760202803505),
  ('film', 0.012146573639630145),
  ('photograph', 0.010209265518500205),
  ('movi', 0.009067911726441176),
  ('hand', 0.007974443388162517),
  ('sort', 0.0078922622

In [127]:
# top terms of every topic at time index 6
# NOTE(review): presumably the last slice, given the 7 coherence values above; confirm
dtm_global.print_topics(time = 6)

[[('stori', 0.026917095097313375),
  ('imag', 0.017536834107727616),
  ('film', 0.01448382786846648),
  ('someth', 0.013592535167406522),
  ('book', 0.010364716511108707),
  ('world', 0.009125246631010464),
  ('pictur', 0.008433726548312903),
  ('movi', 0.0081776950092493),
  ('face', 0.007332163455308627),
  ('sort', 0.0065878069281813225),
  ('hand', 0.006525656826048474),
  ('mind', 0.006449598447303874),
  ('photo', 0.006396398971440471),
  ('memori', 0.006045214511123389),
  ('moment', 0.005882271337665207),
  ('photograph', 0.0058149733012076805),
  ('part', 0.005290007377122242),
  ('camera', 0.004998970573877472),
  ('eye', 0.0049768275403046045),
  ('show', 0.004721615311272516)],
 [('design', 0.026222643629991663),
  ('music', 0.023364725384241537),
  ('space', 0.013877883985016043),
  ('build', 0.013826568549401787),
  ('idea', 0.01323675219753181),
  ('kind', 0.012375173122001917),
  ('piec', 0.012367027821072244),
  ('someth', 0.012027351887649453),
  ('work', 0.0118955586