In [1]:
# Setup cell: imports, import-path tweak and warning filter used by the cells below.
import sys
import pandas as pd
# Add the local OCTIS directory to the import path; the membership check keeps
# the path from being appended again when this cell is re-run.
if 'E:\\Rashi\\octis\\OCTIS' not in sys.path: sys.path.append('E:\\Rashi\\octis\\OCTIS') 

# Topic model wrappers.
from octis.models.LDA import LDA
from octis.models.LDA_gibbs import LDA_gibbs
from octis.models.LDA_tomopy import LDA_tomopy
from octis.models.NMF_scikit import NMF_scikit

# Dataset container and evaluation metrics.
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import KLDivergence
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.perplexity_metrics import Perplexity

# Hyperparameter optimizer and the search-space dimension types it takes.
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Integer, Categorical

# Suppress every warning from here on (note: this hides all warning categories).
from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
#loadCustomData
# Export the article texts to a plain-text file with one document per line.
import os, numpy as np, pandas as pd

fut = pd.read_csv("E:\\Rashi\\datasets\\articles_futurism_year\\articles_futurism_2022.csv")
# Drop exact duplicate rows, then rows without article text: a missing `content`
# is read as NaN (a float) and calling .replace on it would raise AttributeError.
fut = (
    fut.drop_duplicates(ignore_index=True)
       .dropna(subset=['content'])
       .reset_index(drop=True)
)

# Flatten line breaks inside each article so a document never spans several lines.
# '\r' is flattened as well as '\n': a reader using Python's default
# universal-newline mode treats a lone carriage return as a line ending too.
docs = [doc.replace('\r', ' ').replace('\n', ' ') + ' \n' for doc in fut.content.to_list()]

# Mode 'w' truncates any previous export, so the file is rewritten from scratch
# on every run without a separate delete step.
with open('E:\\Rashi\\octis\\articles_futurism_2022_content.txt', 'w', encoding="utf-8") as file:
    file.writelines(docs)

In [None]:
#preprocessData
# Build the preprocessed OCTIS dataset from the exported one-document-per-line file.
from octis.preprocessing.preprocessing import Preprocessing
import spacy, string
from spacy.lang.en import stop_words

# Load the small English spaCy model; the returned pipeline object is not kept.
spacy.load('en_core_web_sm')

# spaCy's English stop words plus three extra terms.
# Note: this rebinds `stop_words` from the imported module to a plain list.
stop_words = [*stop_words.STOP_WORDS, 'share', 'article', 'futurism']

preprocessor = Preprocessing(
    vocabulary=None,
    max_features=None,
    remove_punctuation=True,
    punctuation=string.punctuation,
    lemmatize=True,
    stopword_list=stop_words,
    min_chars=1,
    min_words_docs=50,
)

fut_dataset = preprocessor.preprocess_dataset(
    documents_path='E:\\Rashi\\octis\\articles_futurism_2022_content.txt'
)

In [2]:
# #dump/LoadPickleData
# Restore the preprocessed dataset from disk and build the coherence metric on it.
import pickle

# # #dumpPickleFile
# NOTE(review): this dump path has no `helperFiles` folder, unlike the load path
# below - confirm which location is intended before re-enabling it.
# with open('E:\\Rashi\\octis\\futurism_2022_content_preprocessed.pkl','wb') as file:
#     pickle.dump(fut_dataset,file)

# # readPickleFile
# pickle.load can execute arbitrary code while unpickling; only load files you trust.
with open('E:\\Rashi\\octis\\helperFiles\\futurism_2022_content_preprocessed.pkl', 'rb') as pkl_in:
    fut_dataset = pickle.load(pkl_in)

# Coherence metric (measure 'c_v', topk=10) over the dataset's corpus.
coherence = Coherence(
    texts=fut_dataset.get_corpus(),
    topk=10,
    measure='c_v',
)

In [3]:
# Echo the dataset object so the cell output shows it is defined in this kernel.
fut_dataset

<octis.dataset.dataset.Dataset at 0x1f30b3d4910>

In [None]:
NMF

In [None]:
# Train NMF with fixed hyperparameters and report its topics and coherence.
nmf_model = NMF_scikit(num_topics=37, init='nndsvdar', alpha=0.08)
nmf_output = nmf_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
topic_lines = [' '.join(words) for words in nmf_output['topics']]
print('\n'.join(['Topics:'] + topic_lines))

# Uses the `coherence` metric object created in an earlier cell.
nmf_coherence = coherence.score(nmf_output)
print(f'\nOverall Coherence Score: {nmf_coherence}')

In [None]:
LDA_gibbs

In [3]:
# Train LDA_gibbs with fixed hyperparameters, then report topics and three metrics.
lda_model = LDA_gibbs(num_topics=10, n_iter=200, random_state=100, alpha=0.01, eta=0.001)
lda_output = lda_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
print('\n'.join(['Topics:', *(' '.join(words) for words in lda_output['topics'])]))

# Metric objects are (re)created here so the cell does not depend on earlier ones for them.
coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()
perplex = Perplexity()

# Score in the same order as they are reported.
coherence_score = coherence.score(lda_output)
kld_score = kld.score(lda_output)
perplexity_score = perplex.score(lda_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}  \nPerplexity Score: {perplexity_score}')



Topics:
like way world come point life star look water think
good use set printer easy design little work pad material
use year new time energy change system team able long
day prime amazon deal good headphone sound speaker save ear
musk company twitter tesla crypto elon tell ceo report people
game tv gaming good play smart laptop keyboard pro like
battery watch use apple phone charge device pro feature good
space nasa tell launch earth nft human spacex station accord
camera inch light good use quality kid video feature look
software use like good company feature include earbud tool free

Overall Coherence Score: 0.5116750316501677  
KLD Score: 4.068518611226207  
Perplexity Score: {'perplexity': 2.836535637689653e+69, 'AIC': 6707104.847594256, 'BIC': 6707152.240960529}


In [None]:
LDA_tomopy

In [4]:
# Train LDA_tomopy with fixed settings, then report topics, coherence and KL divergence.
ldaTomopy_model = LDA_tomopy(num_topics=10, max_iters=100)
ldaTomopy_output = ldaTomopy_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
tomopy_topic_lines = [' '.join(words) for words in ldaTomopy_output['topics']]
print('\n'.join(['Topics:'] + tomopy_topic_lines))

coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()
# Created but not scored in this cell.
perplex = Perplexity()

coherence_score = coherence.score(ldaTomopy_output)
kld_score = kld.score(ldaTomopy_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}')

Topics:
battery good feature camera use light video kid inch size
use good design keyboard software turntable key include product type
use phone time case apple allow technology like fly wallet
day prime deal amazon good printer save big d print
like pet robot smart vacuum home point cat heat way
musk company twitter tesla tell people crypto elon spacex report
tv sound good headphone pro inch audio smart screen music
space nasa year star earth scientist launch tell human planet
game watch play controller switch good like console gps nintendo
laptop gaming inch device good pad usb cool mouse detector

Overall Coherence Score: 0.5420507718138781  
KLD Score: 3.899872869699644


In [None]:
HPOptimization

In [None]:
# Hyperparameter search for NMF_scikit, scored with the `coherence` metric
# created in an earlier cell.
nmf_model = NMF_scikit()
search_space = {
    "num_topics": Categorical([50, 70, 100, 120, 150]),
    "init": Categorical(['nndsvd', 'nndsvda', 'nndsvdar']),
    "alpha": Categorical([0.0, 0.25, 0.5, 0.75, 1.0]),
}
optimizer = Optimizer()

optResult = optimizer.optimize(nmf_model, fut_dataset, coherence, search_space, save_path="./outputFiles", number_of_call=30, model_runs=5)
# Save under the exact path the #sortValues cell reads. The previous relative
# name ("NMF_HOCoherence_futurismResults.csv") did not match that file, so a
# fresh run of the notebook could not find the results.
optResult.save_to_csv("E:\\Rashi\\octis\\outputFiles\\Futurism\\NMFscikit_HOCoherence_futurismResults.csv")

In [None]:
# Hyperparameter search for LDA_gibbs, scored with the `coherence` metric
# created in an earlier cell. `n_iter` has a single candidate value.
lda_model = LDA_gibbs()
search_space = {
    "num_topics": Categorical([50, 70, 100, 120, 150]),
    "n_iter": Categorical([100]),
    "alpha": Categorical([0.01, 0.05, 0.1, 0.5, 1.0]),
    "eta": Categorical([0.001, 0.005, 0.01, 0.05, 0.1]),
}
optimizer = Optimizer()

optResult = optimizer.optimize(
    lda_model,
    fut_dataset,
    coherence,
    search_space,
    save_path="./outputFiles",
    number_of_call=30,
    model_runs=5,
)
optResult.save_to_csv("LDAgibbs_HOCoherence_futurismResults.csv")

# Alternative run (disabled): optimize against the KL-divergence metric instead.
# optResult=optimizer.optimize(lda_model, fut_dataset, kld, search_space, save_path="./outputFiles", number_of_call=30, model_runs=5)
# optResult.save_to_csv("LDA_HOKld_results.csv")

In [3]:
# Hyperparameter search for LDA_tomopy, scored with the `coherence` metric
# created in an earlier cell. `max_iters` has a single candidate value.
ldaTomopy_model = LDA_tomopy()
search_space = {
    "num_topics": Categorical([50, 70, 100, 120, 150]),
    "max_iters": Categorical([100]),
    "alpha": Categorical([0.01, 0.05, 0.1, 0.5, 1.0]),
    "eta": Categorical([0.001, 0.005, 0.01, 0.05, 0.1]),
}
optimizer = Optimizer()

optResult = optimizer.optimize(
    ldaTomopy_model,
    fut_dataset,
    coherence,
    search_space,
    save_path="./outputFiles",
    number_of_call=30,
    model_runs=5,
)
optResult.save_to_csv("E:\\Rashi\\octis\\outputFiles\\Futurism\\LDAtomopy_HOCoherence_futurismResults.csv")

  0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
#sortValues
# Re-order the NMF optimization results by median score (best first) and write
# them back to the same file.
results_path = 'E:\\Rashi\\octis\\outputFiles\\Futurism\\NMFscikit_HOCoherence_futurismResults.csv'
df = pd.read_csv(results_path).sort_values(by='Median(model_runs)', ascending=False)
# index=False: otherwise the row index is written as an extra unnamed column,
# and every re-run of this cell would add one more such column to the file.
df.to_csv(results_path, index=False)

In [None]:
Optimized hyperparameters - NMFscikit, LDAgibbs, LDAtomopy

In [6]:
#NMFscikit - optimizedHP
# Train NMF_scikit with the chosen hyperparameters and report topics and metrics.
nmf_model = NMF_scikit(num_topics=50, init='nndsvda', alpha=0.0)
nmf_output = nmf_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
nmf_topic_lines = [' '.join(words) for words in nmf_output['topics']]
print('\n'.join(['Topics:'] + nmf_topic_lines))

coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()

coherence_score = coherence.score(nmf_output)
kld_score = kld.score(nmf_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}')

Topics:
prime amazon day deal good save echo big dog espresso
musk twitter elon speech account company tesla tweet platform free
musk spacex elon tesla starship ceo insider employee company woman
station space iss nasa astronaut international crew russia russian cosmonaut
climate change study human researcher scientist disease rogan sea city
game play vr roblox console gamer genre player title good
headphone ear noise earbud sound audio wireless pair music cancel
tv inch k screen qle smart television refresh light rate
tesla drive self car driver vehicle autopilot fsd beta driving
ftx bankman fry sbf sam crypto exchange collapse ceo fund
controller nintendo switch console xbox game gaming x play ole
fusion energy reactor nuclear power livermore breakthrough experiment scientist claim
nft ape bored sell blockchain nfts artist art opensea marketplace
watch apple smartwatch garmin band gps mm fitness series wrist
nasa launch rocket sls moon artemis mission agency lunar starship
lemoine la

In [3]:
#optimizedHP
# Train LDA_gibbs with the chosen hyperparameters and report topics and metrics.
lda_model = LDA_gibbs(
    num_topics=120,
    n_iter=1000,
    random_state=100,
    alpha=0.1,
    eta=0.01,
)
lda_output = lda_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
print('\n'.join(['Topics:', *(' '.join(words) for words in lda_output['topics'])]))

coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()

coherence_score = coherence.score(lda_output)
kld_score = kld.score(lda_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}')



Topics:
printer print photo hp ink airprint printing canon paper page
star nebulae percent nebula gun black object white cluster emission
user content social platform medium creator youtube tiktok account internet
game controller switch console nintendo play xbox gaming android button
recycle plastic material sustainable company percent product friendly sustainability design
energy fusion nuclear power antimatter reactor claim produce fuel generate
headphone noise earbud ear sound wireless audio pair music cancel
laptop gaming gb graphic ram port card processor pc game
planet earth system solar moon atmosphere sun orbit jupiter large
battery hour long charge life size high small device power
like time think people know actually thing way lot come
grow compost plant composter garden indoor system food soil cut
screen protector iphone glass case protection phone temper protect scratch
like city jetpack space mayman feel drive ground solid foot
mining interface hardware software bitcoin u

In [4]:
#optimizedHP - 2x Topics
# Same LDA_gibbs settings as the 120-topic run, with num_topics doubled to 240.
lda_model = LDA_gibbs(
    num_topics=240,
    n_iter=1000,
    random_state=100,
    alpha=0.1,
    eta=0.01,
)
lda_output = lda_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
lda_topic_lines = [' '.join(words) for words in lda_output['topics']]
print('\n'.join(['Topics:'] + lda_topic_lines))

coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()

coherence_score = coherence.score(lda_output)
kld_score = kld.score(lda_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}')



Topics:
nebulae nebula star cloud object emission gas fact surround comprise
screen protector glass case iphone protection temper protect pack phone
concert fan ticket ticketmaster swift demand join begin speak venue
game switch nintendo console classic title ole retro include experience
high low provide run type unit cost performance brand power
pet camera dog treat cat toy subscription petcube control parent
webb telescope nasa space james image hubble year early agency
weight pound light easy inch size need level small heavy
build create day long like way goal time thing fast
ftx bankman fry exchange crypto sbf sam ceo collapse fund
cable flow iii mess wrap realize tie thick braid durability
gaming pc graphic core machine processor high rate port usb
creator site trump deepfake porn royalty adult digital content account
tv channel live sport offer news cable month content hulu
world year like real time long life early old come
press record ice past bad ap sea level associate break
f

In [None]:
pip install "gensim>=4.2.0,<5.0" nltk pandas spacy "scikit-learn==1.1.0" "scikit-optimize>=0.8.1" matplotlib torch "numpy>=1.23.0,<2.0" libsvm flask sentence-transformers requests tomotopy "scipy<1.13"


In [5]:
#LDAtomopy - optimizedHP
# Train LDA_tomopy with the chosen hyperparameters and report topics and metrics.
ldaTomopy_model = LDA_tomopy(num_topics=70, max_iters=100, alpha=0.1, eta=0.05)
ldaTomopy_output = ldaTomopy_model.train_model(fut_dataset)

# Header line followed by one line per topic (topic words joined by spaces).
print('\n'.join(['Topics:', *(' '.join(words) for words in ldaTomopy_output['topics'])]))

coherence = Coherence(texts=fut_dataset.get_corpus(), topk=10, measure='c_v')
kld = KLDivergence()

coherence_score = coherence.score(ldaTomopy_output)
kld_score = kld.score(ldaTomopy_output)
print(f'\nOverall Coherence Score: {coherence_score}  \nKLD Score: {kld_score}')

Topics:
green grandpad case attorney cluster court star contact claim audible
selling key point gift good hair kit include love great
snow b ufos osiris hot star mcafee nebula cygni bellerophon
doorbell smart video hp home ring security eufy blink storage
oven cook cooking food kitchen heat grill temperature breville recipe
aid hearing hear ear model warranty battery fit bluetooth sound
printer d print printing piece panel resin anycubic screen fitness
kid tablet child age app screen young parent old senior
controller switch xbox game nintendo button play wireless gaming bluetooth
vyper bubble airtag skin sunscreen spray reznor sun plate water
good feature pro use price high include cut model spec
meta metaverse facebook zuckerberg virtual vr tiktok mark reality datum
space russia station russian iss ukraine rogozin international nasa astronaut
bot chatbot ai user facial post search photo chinese recognition
light energy fusion lead strip power ray nuclear lighting sonos
hdmi modulator