In [5]:
# Standard-library and NumPy imports.
import sys, numpy as np, pickle
# Put the OCTIS directory on the import path (presumably a local checkout — confirm).
# The membership test keeps repeated runs of this cell from appending the same entry twice.
if '/home/sagemaker-user/octis/OCTIS' not in sys.path: sys.path.append('/home/sagemaker-user/octis/OCTIS') 

# Topic-model wrappers from OCTIS.
from octis.models.LDA import LDA
from octis.models.LDA_gibbs import LDA_gibbs
from octis.models.LDA_tomopy import LDA_tomopy
from octis.models.NMF_scikit import NMF_scikit

# Dataset loader and evaluation metrics.
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

# Hyperparameter optimizer and the skopt search-space dimension types.
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Integer, Categorical

# Install a blanket 'ignore' filter: warnings issued through the `warnings`
# module are suppressed for the rest of the kernel session.
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
# Load data: fetch the dataset registered under the name 'DBLP'.
dblp_dataset = Dataset()
dblp_dataset.fetch_dataset('DBLP')

# Coherence metric shared by every cell below; it is built from the dataset's
# corpus and configured with topk=10 and measure='c_v'.
coherence = Coherence(
    texts=dblp_dataset.get_corpus(),
    topk=10,
    measure='c_v',
)

NMF_scikit

In [None]:
# Train NMF_scikit with 20 topics on dblp_dataset, then report its topics and coherence.
nmf_model_20 = NMF_scikit(num_topics=20, init='nndsvdar', alpha=0.0)
nmf_output_20 = nmf_model_20.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in nmf_output_20['topics'])]))

# Score the trained output with the shared coherence metric.
nmf_coherence_20 = coherence.score(nmf_output_20)
print(f'\nOverall Coherence Score: {nmf_coherence_20}')

In [4]:
# Train NMF_scikit with 100 topics on dblp_dataset, then report its topics and coherence.
# No alpha is passed here, so the model's own default applies.
nmf_model_100 = NMF_scikit(num_topics=100, init='nndsvdar')
nmf_output_100 = nmf_model_100.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in nmf_output_100['topics'])]))

# Score the trained output with the shared coherence metric.
nmf_coherence_100 = coherence.score(nmf_output_100)
print(f'\nOverall Coherence Score: {nmf_coherence_100}')

Topics:
sin love life human hate live die death hell faith
window manager application app screen advance event user top position
game baseball night watch playoff pen hockey series devil score
model engine manual build base datum science design assume seat
phone company clipper network encryption office listen conversation voice wire
card video slot bus vga controller local graphic performance mode
drive hard floppy ide format controller meg tape boot internal
key encryption escrow public algorithm security encrypt clipper secure secret
file format directory convert download utility site create command edit
car engine mile owner dealer tire auto brake insurance oil
state jewish israeli arab country peace war land live attack
team season playoff hockey expansion move star division final record
religion belief claim atheist true evidence exist argument truth atheism
image format convert processing datum graphic screen object quality resolution
system user moral boot objective base access

LDA_gibbs

In [5]:
# Train LDA_gibbs with 20 topics (n_iter=1000) on dblp_dataset, then report topics and coherence.
lda_model_20 = LDA_gibbs(num_topics=20, n_iter=1000)
lda_output_20 = lda_model_20.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in lda_output_20['topics'])]))

# Score the trained output with the shared coherence metric.
lda_coherence_20 = coherence.score(lda_output_20)
print(f'\nOverall Coherence Score: {lda_coherence_20}')

Topics:
drive card disk system driver work problem scsi monitor hard
man church word people love sin life christian make homosexual
car price sell good buy sale offer pay make engine
key chip encryption phone clipper government system bit public privacy
space launch system earth mission orbit satellite year planet station
book write read article copy issue text paper find author
people thing time start happen make child day hear year
make people work time talk year president question money give
system software include support user version base server run computer
armenian people war government turkish jewish israeli kill country attack
back time leave turn line work start day door side
post mail send list address information group email question message
image file color bit format graphic datum program display version
power problem thing good make work time wire ground high
file window program run set font entry application problem line
study year number patient drug report high resear

In [6]:
# Train LDA_gibbs with 100 topics (n_iter=1000) on dblp_dataset, then report topics and coherence.
lda_model_100 = LDA_gibbs(num_topics=100, n_iter=1000)
lda_output_100 = lda_model_100.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in lda_output_100['topics'])]))

# Score the trained output with the shared coherence metric.
lda_coherence_100 = coherence.score(lda_output_100)
print(f'\nOverall Coherence Score: {lda_coherence_100}')

LDA_tomotopy

In [3]:
# Train LDA_tomopy with 20 topics (max_iters=100) on dblp_dataset, then report topics and coherence.
ldaTomopy_model_20 = LDA_tomopy(num_topics=20, max_iters=100)
ldaTomopy_output_20 = ldaTomopy_model_20.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in ldaTomopy_output_20['topics'])]))

# Score the trained output with the shared coherence metric.
ldaTomopy_coherence_20 = coherence.score(ldaTomopy_output_20)
print(f'\nOverall Coherence Score: {ldaTomopy_coherence_20}')

Topics:
make point people question thing claim good give read true
armenian people kill turkish war government attack israeli jewish village
system computer program technology provide design project information develop user
file image program version server graphic format color display software
space launch mission satellite earth orbit planet year shuttle solar
people make time good work thing talk question job problem
fire child batf agent police tank compound report warrant start
game team year play win player good season score hit
church word man book sin love life faith child christian
price sell good offer sale mail include buy interested book
law government gun state crime weapon control criminal public firearm
religion people belief man atheist human homosexual moral religious atheism
mail post list send information address group message article email
key chip encryption clipper phone bit system number security algorithm
people time start back day happen year leave thing hear
y

In [5]:
# Train LDA_tomopy with 100 topics (max_iters=100) on dblp_dataset, then report topics and coherence.
ldaTomopy_model_100 = LDA_tomopy(num_topics=100, max_iters=100)
ldaTomopy_output_100 = ldaTomopy_model_100.train_model(dblp_dataset)

# Header line first, then one line per topic with that topic's entries joined by spaces.
print('\n'.join(['Topics:', *(' '.join(words) for words in ldaTomopy_output_100['topics'])]))

# Score the trained output with the shared coherence metric.
ldaTomopy_coherence_100 = coherence.score(ldaTomopy_output_100)
print(f'\nOverall Coherence Score: {ldaTomopy_coherence_100}')

Topics:
law rule entry court make case legal note judge require
man son father mother speak make child hell death pray
observation science scientist experiment satellite activity main solar result field
child homosexual male parent man sexual adult sex woman homosexuality
goal period lead shot play penalty save score power line
privacy technology computer encryption product communication system security government standard
font printer print window screen character page laser driver size
question answer text case die death claim act record give
claim evidence show point find support case make account place
human land arab live peace life jewish part territory give
light vote theory good physical universe material matter include hole
mail send message address internet information email network user request
people state make person idea medium issue create business support
term science idea sense part view scientific knowledge base group
time happen thing make bring talk bad win turn peo

Hyperparameter Optimization

In [7]:
# NMF_scikit: search over topic count, initialisation scheme and alpha,
# using the shared coherence metric as the objective passed to the optimizer.
nmf_model = NMF_scikit()
search_space = {
    "num_topics": Categorical([20, 50, 70, 100]),
    "init": Categorical(['nndsvda', 'nndsvdar']),
    "alpha": Categorical([0.0, 0.25, 0.5, 0.75, 1.0]),
}
optimizer = Optimizer()

# Run with number_of_call=30 and model_runs=5; "./outputFiles" is handed to
# the optimizer as its save_path.
optResult = optimizer.optimize(
    nmf_model,
    dblp_dataset,
    coherence,
    search_space,
    save_path="./outputFiles",
    number_of_call=30,
    model_runs=5,
)
# Export the optimization results to a CSV (relative path).
optResult.save_to_csv("NMFscikit_HOCoherence_dblpResults.csv")

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# LDA_gibbs: search over topic count, alpha and eta with n_iter pinned to 100,
# using the shared coherence metric as the objective passed to the optimizer.
lda_model = LDA_gibbs()
# NOTE(review): the topic-count key here is "n_topics", while the LDA_gibbs
# constructor is called with `num_topics=` elsewhere in this notebook and the
# other search spaces use "num_topics". Confirm which key the model actually
# reads; a mismatched key could leave the topic count unsearched.
search_space = {
    "n_topics": Categorical([20, 50, 70, 100]),
    "n_iter": Categorical([100]),
    "alpha": Categorical([0.01, 0.05, 0.1, 0.5, 1.0]),
    "eta": Categorical([0.001, 0.005, 0.01, 0.05, 0.1]),
}
optimizer = Optimizer()

# Run with number_of_call=30 and model_runs=5; "./outputFiles" is handed to
# the optimizer as its save_path.
optResult = optimizer.optimize(
    lda_model,
    dblp_dataset,
    coherence,
    search_space,
    save_path="./outputFiles",
    number_of_call=30,
    model_runs=5,
)
# Export the optimization results to a CSV (relative path).
optResult.save_to_csv("LDAgibbs_HOCoherence_dblpResults.csv")

In [9]:
# LDA_tomopy: search over topic count, alpha and eta with max_iters pinned to 100,
# using the shared coherence metric as the objective passed to the optimizer.
ldatomopy_model = LDA_tomopy()
search_space = {
    "num_topics": Categorical([20, 50, 70, 100]),
    "max_iters": Categorical([100]),
    "alpha": Categorical([0.01, 0.05, 0.1, 0.5, 1.0]),
    "eta": Categorical([0.001, 0.005, 0.01, 0.05, 0.1]),
}
optimizer = Optimizer()

# Run with number_of_call=30 and model_runs=5; "./outputFiles" is handed to
# the optimizer as its save_path.
optResult = optimizer.optimize(
    ldatomopy_model,
    dblp_dataset,
    coherence,
    search_space,
    save_path="./outputFiles",
    number_of_call=30,
    model_runs=5,
)
# Export the optimization results to a CSV (relative path).
optResult.save_to_csv("LDAtomopy_HOCoherence_dblpResults.csv")

  0%|          | 0/30 [00:00<?, ?it/s]

INFO:gensim.topic_coherence.probability_estimation:using WordOccurrenceAccumulator to estimate probabilities from sliding windows
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 1000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 2000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 3000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 4000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 5000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 6000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 7000 documents
INFO:gensim.topic_coherence.text_analysis:WordOccurrenceAccumulator accumulated stats from 8000 documents
INFO:gensim.topic_cohe

In [12]:
#ProcessingDatasets - Sorting
# Re-order the LDA_gibbs optimization results so that rows with the highest
# 'Median(model_runs)' come first, then overwrite the same CSV with the
# sorted table.
import pandas as pd

# Single source of truth for the file that is read and then overwritten.
# NOTE(review): the optimization cell writes this file name via a relative
# path; this absolute path assumes the notebook's working directory is
# /home/sagemaker-user/octis — confirm.
results_csv = '/home/sagemaker-user/octis/LDAgibbs_HOCoherence_dblpResults.csv'

# sort_values returns a new frame, so no in-place mutation is needed.
df = pd.read_csv(results_csv).sort_values(by='Median(model_runs)', ascending=False)

# index=False keeps the (now shuffled) row index out of the file. Without it,
# every run of this cell adds another unnamed index column to the CSV, so the
# cell is not safe to re-run.
df.to_csv(results_csv, index=False)