# Preparing environment

In [1]:
import os
import time
import numpy as np
import pandas as pd

# Dataset
from octis.dataset.dataset import Dataset

# Metrics
from custom.metrics.TDCI import TDCI
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

# Topic Models
from octis.models.CTM import CTM
from octis.models.ETM import ETM
from octis.models.HDP import HDP
from octis.models.LDA import LDA
from octis.models.LSI import LSI
from octis.models.NMF import NMF
from octis.models.ProdLDA import ProdLDA
from octis.models.NeuralLDA import NeuralLDA
from custom.models.CustomTop2Vec import CustomTop2Vec
from custom.models.CustomBERTopic import CustomBERTopic

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
def remove_unused():
    try:
        os.remove("_train.pkl")
        os.remove("_test.pkl")
        os.remove("_val.pkl")
    except:
        pass

# Datasets

In [3]:
# Load PL 3723/2019
dataset_pl = Dataset()
dataset_pl.load_custom_dataset_from_folder("../datasets/pl_3723_2019")

# Load PEC 471/2005
dataset_pec = Dataset()
dataset_pec.load_custom_dataset_from_folder("../datasets/pec_471_2005")

In [4]:
print("Size of PL 3723/2019 corpus: ", len(dataset_pl.get_corpus()))
print("Size of PL 3723/2019 vocabulary: ", len(dataset_pl.get_vocabulary()))

print("Mean document length of PL 3723/2019: ", np.mean([len(doc) for doc in dataset_pl.get_corpus()]))
print("Min and max document length of PL 3723/2019: ", np.min([len(doc) for doc in dataset_pl.get_corpus()]), np.max([len(doc) for doc in dataset_pl.get_corpus()]))

Size of PL 3723/2019 corpus:  705
Size of PL 3723/2019 vocabulary:  2131
Mean document length of PL 3723/2019:  10.44822695035461
Min and max document length of PL 3723/2019:  1 37


In [5]:
print("Size of PEC 471/2005 corpus: ", len(dataset_pec.get_corpus()))
print("Size of PEC 471/2005 vocabulary: ", len(dataset_pec.get_vocabulary()))
print("Mean document length of PEC 471/2005: ", np.mean([len(doc) for doc in dataset_pec.get_corpus()]))
print("Min and max document length of PEC 471/2005: ", np.min([len(doc) for doc in dataset_pec.get_corpus()]), np.max([len(doc) for doc in dataset_pec.get_corpus()]))

Size of PEC 471/2005 corpus:  628
Size of PEC 471/2005 vocabulary:  2088
Mean document length of PEC 471/2005:  11.546178343949045
Min and max document length of PEC 471/2005:  1 48


# Metrics

In [6]:
tc_pl_metric = Coherence(texts=dataset_pl.get_corpus())
tdci_pl_metric = TDCI(texts=dataset_pl.get_corpus())
td_pl_metric = TopicDiversity()

In [7]:
tc_pec_metric = Coherence(texts=dataset_pec.get_corpus())
tdci_pec_metric = TDCI(texts=dataset_pec.get_corpus())
td_pec_metric = TopicDiversity()

In [56]:
def get_best_df(alg_name, dataset_name, use_topics=True):
    for n_topic in [10, 20, 30, 40, 50]:
        if n_topic == 10:
            df = pd.DataFrame(pd.read_csv("../tunning/csv/" + alg_name + "-" + dataset_name + ("-" + str(n_topic) if use_topics else "") + ".csv").sort_values(
                by="Mean(model_runs)", ascending=False).head(1).reset_index(drop=True))
            
        else:
            vals = pd.DataFrame(pd.read_csv("../tunning/csv/" + alg_name + "-" + dataset_name + ("-" + str(n_topic) if use_topics else "") + ".csv").sort_values(
                by="Mean(model_runs)", ascending=False).head(1).reset_index(drop=True)).to_dict()
        
            df.loc[len(df)] = {key: value[0] for key, value in vals.items()}

    return df.sort_values(by="Mean(model_runs)", ascending=False).reset_index(drop=True)

In [64]:
# Get first row of df as dict
def get_hyperparams(df):
    return {key: value[0] for key, value in df[df.columns[8:-2]].head(1).to_dict().items()}

# Visualize - PL 3723/2019

In [61]:
# Get CTM best hyperparameters
ctm_pl_df = get_best_df("ctm", "pl_3723_2019")
ctm_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,inference_type,model_type,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,6,575.039418,0.579217,0.575805,0.012693,relu,0.711358,combined,LDA,367,1,200,50,30,sgd,-0.104516,0.84
1,dataset_name,RF,LCB,7,918.352537,0.563788,0.56302,0.005896,tanh,0.802842,combined,prodLDA,119,4,300,40,50,sgd,-0.137433,0.784
2,dataset_name,RF,LCB,7,898.508395,0.56219,0.562374,0.019157,tanh,0.802842,combined,prodLDA,119,4,300,40,40,sgd,-0.133267,0.8225
3,dataset_name,RF,LCB,9,1180.754644,0.533357,0.557952,0.038796,relu,0.732236,combined,prodLDA,338,4,200,49,20,sgd,-0.232828,0.875
4,dataset_name,RF,LCB,7,69.862385,0.53907,0.535245,0.01126,tanh,0.802842,combined,prodLDA,119,4,300,40,10,sgd,-0.244218,0.95


In [66]:
# Get ETM best hyperparameters
etm_pl_df = get_best_df("etm", "pl_3723_2019")
etm_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,num_epochs,num_neurons,num_topics,optimizer,rho,t_hidden_size,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,2,71.796367,0.424573,0.424196,0.014654,relu,0.30775,137,100,10,asgd,296,487,-0.330412,0.63
1,dataset_name,RF,LCB,7,1120.583583,0.404651,0.406584,0.005756,softplus,0.286217,240,100,20,asgd,388,440,-0.348936,0.57
2,dataset_name,RF,LCB,4,127.74509,0.398066,0.397826,0.000982,softplus,0.900379,274,100,30,asgd,367,471,-0.443223,0.703333
3,dataset_name,RF,LCB,6,204.555664,0.388241,0.388373,0.0008,softplus,0.150885,351,100,50,asgd,293,786,-0.463674,0.698
4,dataset_name,RF,LCB,4,136.023815,0.383294,0.383902,0.004205,softplus,0.900379,274,100,40,asgd,367,471,-0.491854,0.78


In [12]:
# Get HDP best hyperparameters
hdp_pl_df = get_best_df("hdp", "pl_3723_2019", use_topics=False)
hdp_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),kappa,tau,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,3,18.402431,0.3617,0.362193,0.001432,0.52115,72.429074,-0.504912,0.682
1,dataset_name,RF,LCB,3,18.402431,0.3617,0.362193,0.001432,0.52115,72.429074,-0.504912,0.682
2,dataset_name,RF,LCB,3,18.402431,0.3617,0.362193,0.001432,0.52115,72.429074,-0.504912,0.682
3,dataset_name,RF,LCB,3,18.402431,0.3617,0.362193,0.001432,0.52115,72.429074,-0.504912,0.682
4,dataset_name,RF,LCB,3,18.402431,0.3617,0.362193,0.001432,0.52115,72.429074,-0.504912,0.682


In [71]:
# Get LDA best hyperparameters
lda_pl_df = get_best_df("lda", "pl_3723_2019")
lda_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),alpha,chunksize,decay,gamma_threshold,iterations,num_topics,offset,passes,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,1,320.162946,0.648233,0.634026,0.029963,asymmetric,1363,0.634605,0.458661,12,20,8.260694,383,0.056237,0.78
1,dataset_name,RF,LCB,0,79.917479,0.623739,0.624827,0.006107,asymmetric,41715,0.534526,0.320128,488,30,2.20302,84,0.050879,0.75
2,dataset_name,RF,LCB,6,206.380311,0.62223,0.613843,0.016544,asymmetric,2644,0.726734,0.25291,430,10,9.909233,224,-0.022234,0.88
3,dataset_name,RF,LCB,5,1946.726467,0.585922,0.581863,0.02027,symmetric,15623,0.875413,0.054385,632,50,5.997963,428,-0.023105,0.732
4,dataset_name,RF,LCB,0,70.710448,0.579382,0.577358,0.006869,asymmetric,41715,0.534526,0.320128,488,40,2.20302,84,0.026591,0.665


In [72]:
# Get LSI best hyperparameters
lsi_pl_df = get_best_df("lsi", "pl_3723_2019")
lsi_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),chunksize,decay,extra_samples,num_topics,power_iters,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,6,29.471644,0.457595,0.477086,0.029005,37889,0.748736,33,10,1,-0.004537,0.43
1,dataset_name,RF,LCB,2,29.625747,0.410358,0.410266,0.009478,34399,0.779017,147,20,6,-0.106995,0.38
2,dataset_name,RF,LCB,9,233.48805,0.35264,0.352373,0.002678,42688,0.650908,19,40,1,-0.209854,0.32
3,dataset_name,RF,LCB,3,42.899925,0.345518,0.351764,0.014632,30062,0.872725,51,30,2,-0.147643,0.29
4,dataset_name,RF,LCB,8,307.47156,0.324974,0.327507,0.011942,35816,0.72096,169,50,1,-0.240044,0.286


In [73]:
# Get NeuralLDA best hyperparameters
neurallda_pl_df = get_best_df("neurallda", "pl_3723_2019")
neurallda_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,lr,momentum,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,9,643.911896,0.561857,0.565034,0.019965,relu,0.43251,0.008744,0.796968,114,1,950,42,50,adam,-0.155814,0.84
1,dataset_name,RF,LCB,7,163.790888,0.494397,0.495968,0.004017,relu,0.26638,0.009618,0.897813,74,1,580,42,20,sgd,-0.304526,0.845
2,dataset_name,RF,LCB,3,71.80403,0.490811,0.492855,0.003417,relu,0.759705,0.00419,0.511511,348,4,547,49,10,sgd,-0.283882,0.78
3,dataset_name,RF,LCB,9,21.203326,0.481545,0.477085,0.007745,relu,0.937184,0.009627,0.523781,287,9,461,39,30,adam,-0.277063,0.686667
4,dataset_name,RF,LCB,7,1578.883242,0.450504,0.450578,0.001467,relu,0.26638,0.009618,0.897813,74,1,580,42,40,sgd,-0.343345,0.7175


In [74]:
# Get NMF best hyperparameters
nmf_pl_df = get_best_df("nmf", "pl_3723_2019")
nmf_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),chunksize,h_max_iter,h_stop_condition,kappa,minimum_probability,num_topics,passes,w_max_iter,w_stop_condition,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,1,943.026366,0.601655,0.595579,0.020954,1360,92,0.00917,0.807165,0.081825,40,443,109,0.000352,0.052257,0.7025
1,dataset_name,RF,LCB,5,1032.986445,0.588372,0.591804,0.006343,5629,54,0.007353,0.81576,0.056435,50,447,179,0.000813,0.035367,0.684
2,dataset_name,RF,LCB,4,336.492251,0.576237,0.582421,0.012529,9225,31,0.007002,0.83672,0.092007,20,178,486,0.000689,-0.03484,0.715
3,dataset_name,RF,LCB,7,419.437063,0.578174,0.578719,0.002948,7825,37,0.007505,0.849872,0.049999,10,236,491,0.000854,-0.051151,0.75
4,dataset_name,RF,LCB,5,829.656851,0.568496,0.575207,0.011702,5629,54,0.007353,0.81576,0.056435,30,447,179,0.000813,-0.038751,0.696667


In [75]:
# Get ProdLDA best hyperparameters
prodlda_pl_df = get_best_df("prodlda", "pl_3723_2019")
prodlda_pl_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,lr,momentum,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,7,18.28143,0.569025,0.566211,0.007137,relu,0.26638,0.009618,0.897813,74,1,580,42,20,sgd,-0.125793,0.795
1,dataset_name,RF,LCB,1,17.25954,0.56671,0.558785,0.012177,softplus,0.705528,0.007475,0.550202,134,5,392,34,30,sgd,-0.164152,0.853333
2,dataset_name,RF,LCB,9,214.915105,0.551473,0.552922,0.002179,softplus,0.816766,0.00839,0.687112,409,2,350,49,10,adam,-0.169552,0.82
3,dataset_name,RF,LCB,7,693.069219,0.556317,0.552101,0.013977,relu,0.26638,0.009618,0.897813,74,1,580,42,40,sgd,-0.114227,0.745
4,dataset_name,RF,LCB,9,109.200047,0.546379,0.542963,0.00571,softplus,0.874888,0.005326,0.678754,266,4,331,27,50,sgd,-0.176594,0.812


# Visualize - PEC 471/2005

In [76]:
# Get CTM best hyperparameters
ctm_pec_df = get_best_df("ctm", "pec_471_2005")
ctm_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,inference_type,model_type,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,8,20.962127,0.554368,0.553415,0.023603,sigmoid,0.222712,combined,prodLDA,324,4,100,50,30,sgd,-0.140262,0.79
1,dataset_name,RF,LCB,9,18.139311,0.554548,0.54879,0.011307,rrelu,0.7337,zeroshot,LDA,304,3,200,45,40,sgd,-0.169758,0.815
2,dataset_name,RF,LCB,9,25.456002,0.549203,0.546692,0.021639,selu,0.145278,combined,prodLDA,475,2,50,45,50,sgd,-0.119633,0.73
3,dataset_name,RF,LCB,5,16.641629,0.535274,0.535867,0.011771,rrelu,0.339496,zeroshot,prodLDA,118,4,50,49,20,sgd,-0.230781,0.88
4,dataset_name,RF,LCB,6,28.694756,0.517234,0.516988,0.001951,leakyrelu,0.138377,combined,prodLDA,359,1,300,48,10,sgd,-0.283527,0.93


In [77]:
# Get ETM best hyperparameters
etm_pec_df = get_best_df("etm", "pec_471_2005")
etm_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,num_epochs,num_neurons,num_topics,optimizer,rho,t_hidden_size,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,9,126.946013,0.432631,0.430806,0.003768,relu,0.81949,227,100,10,adadelta,223,874,-0.30277,0.57
1,dataset_name,RF,LCB,4,123.044231,0.417897,0.416041,0.003818,softplus,0.900379,274,100,20,asgd,367,471,-0.371115,0.63
2,dataset_name,RF,LCB,1,183.293477,0.404854,0.404556,0.004773,relu,0.705528,357,200,30,adadelta,245,775,-0.413784,0.68
3,dataset_name,RF,LCB,1,188.099021,0.390607,0.391055,0.0009,relu,0.705528,357,200,40,adadelta,245,775,-0.482816,0.795
4,dataset_name,RF,LCB,1,201.582606,0.3797,0.380264,0.001249,relu,0.705528,357,200,50,adadelta,245,775,-0.504805,0.814


In [78]:
# Get HDP best hyperparameters
hdp_pec_df = get_best_df("hdp", "pec_471_2005", use_topics=False)
hdp_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),kappa,tau,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,7,18.734225,0.403176,0.402933,0.001337,0.66803,8.229062,-0.39879,0.612
1,dataset_name,RF,LCB,7,18.734225,0.403176,0.402933,0.001337,0.66803,8.229062,-0.39879,0.612
2,dataset_name,RF,LCB,7,18.734225,0.403176,0.402933,0.001337,0.66803,8.229062,-0.39879,0.612
3,dataset_name,RF,LCB,7,18.734225,0.403176,0.402933,0.001337,0.66803,8.229062,-0.39879,0.612
4,dataset_name,RF,LCB,7,18.734225,0.403176,0.402933,0.001337,0.66803,8.229062,-0.39879,0.612


In [79]:
# Get LDA best hyperparameters
lda_pec_df = get_best_df("lda", "pec_471_2005")
lda_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),alpha,chunksize,decay,gamma_threshold,iterations,num_topics,offset,passes,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,0,55.186172,0.612513,0.616814,0.010885,asymmetric,41715,0.534526,0.320128,488,20,2.20302,84,-0.036033,0.85
1,dataset_name,RF,LCB,4,268.01031,0.623209,0.61477,0.053697,asymmetric,7993,0.723881,0.186447,650,10,9.408266,433,0.046849,0.77
2,dataset_name,RF,LCB,5,326.157101,0.5881,0.598738,0.027171,asymmetric,15623,0.875413,0.054385,632,30,5.997963,428,0.007209,0.75
3,dataset_name,RF,LCB,2,187.25063,0.571119,0.572146,0.008345,symmetric,22107,0.518447,0.595289,470,40,3.999333,264,-0.10612,0.8225
4,dataset_name,RF,LCB,2,205.281859,0.565017,0.559815,0.008966,symmetric,22107,0.518447,0.595289,470,50,3.999333,264,-0.099284,0.756


In [80]:
# Get LSI best hyperparameters
lsi_pec_df = get_best_df("lsi", "pec_471_2005")
lsi_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),chunksize,decay,extra_samples,num_topics,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,7,15.195222,0.492293,0.489481,0.011413,14365,0.616962,144,10,-0.06522,0.52
1,dataset_name,RF,LCB,8,28.068654,0.422516,0.422078,0.013739,31512,0.815253,19,20,-0.128358,0.41
2,dataset_name,RF,LCB,1,41.530271,0.381461,0.380889,0.000891,16295,0.647302,174,30,-0.171093,0.353333
3,dataset_name,RF,LCB,5,54.425396,0.340164,0.342066,0.007053,7655,0.959472,31,40,-0.222919,0.3025
4,dataset_name,RF,LCB,8,67.69895,0.331443,0.32954,0.010237,1688,0.981793,22,50,-0.2595,0.3


In [81]:
# Get NeuralLDA best hyperparameters
neurallda_pec_df = get_best_df("neurallda", "pec_471_2005")
neurallda_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,lr,momentum,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,7,21.426088,0.552825,0.547277,0.012768,softplus,0.26638,0.009618,0.897813,74,1,580,42,50,adam,-0.196502,0.87
1,dataset_name,RF,LCB,7,20.151407,0.531959,0.534612,0.004213,softplus,0.26638,0.009618,0.897813,74,1,580,42,20,adam,-0.254604,0.925
2,dataset_name,RF,LCB,7,19.499985,0.53293,0.532157,0.01144,softplus,0.26638,0.009618,0.897813,74,1,580,42,40,adam,-0.239324,0.89
3,dataset_name,RF,LCB,7,15.467923,0.526579,0.531731,0.011457,softplus,0.26638,0.009618,0.897813,74,1,580,42,30,adam,-0.260138,0.913333
4,dataset_name,RF,LCB,6,18.64149,0.489514,0.490153,0.007994,relu,0.147565,0.004482,0.841802,257,3,792,37,10,sgd,-0.349717,0.96


In [82]:
# Get NMF best hyperparameters
nmf_pec_df = get_best_df("nmf", "pec_471_2005")
nmf_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),chunksize,h_max_iter,h_stop_condition,kappa,minimum_probability,num_topics,passes,w_max_iter,w_stop_condition,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,0,50.622741,0.602489,0.599376,0.010553,4820,85,0.001621,0.659724,0.053667,10,107,167,0.000259,-0.001883,0.76
1,dataset_name,RF,LCB,2,116.571098,0.591433,0.594204,0.004278,5273,80,0.009716,0.662757,0.057977,30,224,483,0.000852,0.005462,0.726667
2,dataset_name,RF,LCB,5,203.840664,0.580978,0.585311,0.006213,5629,54,0.007353,0.81576,0.056435,20,447,179,0.000813,-0.024705,0.725
3,dataset_name,RF,LCB,5,240.655395,0.582812,0.584749,0.002792,5629,54,0.007353,0.81576,0.056435,50,447,179,0.000813,0.038024,0.674
4,dataset_name,RF,LCB,7,212.036629,0.579851,0.582878,0.012854,6693,64,0.00775,0.730072,0.030988,40,401,111,0.000424,0.010828,0.68


In [83]:
# Get ProdLDA best hyperparameters
prodlda_pec_df = get_best_df("prodlda", "pec_471_2005")
prodlda_pec_df

Unnamed: 0,dataset,surrogate model,acquisition function,num_iteration,time,Median(model_runs),Mean(model_runs),Standard_Deviation(model_runs),activation,dropout,lr,momentum,num_epochs,num_layers,num_neurons,num_samples,num_topics,solver,Coherence(not optimized),Topic diversity(not optimized)
0,dataset_name,RF,LCB,7,22.548415,0.642022,0.643323,0.001986,softplus,0.26638,0.009618,0.897813,74,1,580,42,50,adam,-0.021763,0.934
1,dataset_name,RF,LCB,7,20.963147,0.635366,0.640111,0.011816,softplus,0.26638,0.009618,0.897813,74,1,580,42,40,adam,-0.037662,0.935
2,dataset_name,RF,LCB,7,21.374371,0.605337,0.608147,0.004622,softplus,0.26638,0.009618,0.897813,74,1,580,42,30,adam,-0.102606,0.93
3,dataset_name,RF,LCB,7,17.498507,0.603172,0.602726,0.013716,softplus,0.26638,0.009618,0.897813,74,1,580,42,10,adam,-0.116285,0.99
4,dataset_name,RF,LCB,7,21.607066,0.595421,0.593876,0.003737,softplus,0.245056,0.004834,0.883513,397,1,900,32,20,adam,-0.126328,0.945


# Training best models

In [84]:
model_names = ["CTM", "ETM", "HDP", "LDA", "LSI", "NeuralLDA", "NMF", "ProdLDA", "BERTopic", "Top2Vec"]

In [85]:
def eval_model(model, dataset, tdci, td, tc):
    """
    Evaluate a model using the metrics defined in the beginning of this notebook.

    Returns:
        tdci_results: TDCI score
        coherence_results: Coherence score
        diversity_results: Diversity score
        topics: Topics
        wall_time: Wall time
    """

    res = dict()

    try:
        start = time.time()
        results = model.train_model(dataset)
        end = time.time()

        coherence_results = tc.score(results)
        diversity_results = td.score(results)
        tdci_results = tdci.score(results)

        remove_unused()

        res['tdci'] = tdci_results
        res['coherence'] = coherence_results
        res['diversity'] = diversity_results
        res['topics'] = len(results['topics'])
        res['wall_time'] = end - start
    
    except:

        remove_unused()
        res['tdci'] = 0
        res['coherence'] = 0
        res['diversity'] = 0
        res['topics'] = 0
        res['wall_time'] = 0

    return res

## PL

In [86]:
ctm_pl_model = CTM(**get_hyperparams(ctm_pl_df))
etm_pl_model = ETM(**get_hyperparams(etm_pl_df.drop(columns=['num_neurons', 'rho'])))
hdp_pl_model = HDP(**get_hyperparams(hdp_pl_df))
lda_pl_model = LDA(**get_hyperparams(lda_pl_df))
lsi_pl_model = LSI(**get_hyperparams(lsi_pl_df))
neurallda_pl_model = NeuralLDA(**get_hyperparams(neurallda_pl_df))
nmf_pl_model = NMF(**get_hyperparams(nmf_pl_df))
prodlda_pl_model = ProdLDA(**get_hyperparams(prodlda_pl_df))
bertopic_pl_model = CustomBERTopic()
top2vec_pl_model = CustomTop2Vec()

pl_models = [ctm_pl_model, etm_pl_model, hdp_pl_model, lda_pl_model, lsi_pl_model, neurallda_pl_model, nmf_pl_model, prodlda_pl_model, bertopic_pl_model, top2vec_pl_model]

In [89]:
%%capture

df_test_pl = pd.DataFrame(
    {"Model": [], "TDCI": [], "TD": [], "TC": [], "Topics": [], "Wall time": []})

for i in range(10):
    print("=============")
    print("Iteration" +  str(i))
    print("=============")
    
    for model, name in zip(pl_models, model_names):
        res = eval_model(model, dataset_pl, tdci_pl_metric, td_pl_metric, tc_pl_metric)

        df_test_pl.loc[len(df_test_pl)] = [name, res['tdci'], res['diversity'], res['coherence'], res['topics'], res['wall_time']]
        

2023-07-03 14:58:31,592 - top2vec - INFO - Pre-processing documents for training
2023-07-03 14:58:31,609 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 14:58:32,813 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-03 14:58:36,201 - top2vec - INFO - Finding dense areas of documents
2023-07-03 14:58:36,218 - top2vec - INFO - Finding topics
2023-07-03 15:04:15,015 - top2vec - INFO - Pre-processing documents for training
2023-07-03 15:04:15,033 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 15:04:16,241 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-03 15:04:19,441 - top2vec - INFO - Finding dense areas of documents
2023-07-03 15:04:19,459 - top2vec - INFO - Finding topics
2023-07-03 15:10:01,431 - top2vec - INFO - Pre-processing documents for training
2023-07-03 15:10:01,451 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 15:10:02,738 - top2vec - INFO - Creating lower dim

In [90]:
df_test_pl

Unnamed: 0,Model,TDCI,TD,TC,Topics,Wall time
0,CTM,0.583627,0.853333,-0.113071,30,35.634659
1,ETM,0.425183,0.660000,-0.372785,10,25.938432
2,HDP,0.361276,0.668000,-0.504820,150,2.488105
3,LDA,0.631062,0.810000,0.033756,20,82.498787
4,LSI,0.461292,0.430000,-0.005009,10,4.649957
...,...,...,...,...,...,...
95,NeuralLDA,0.505984,0.816000,-0.266648,50,12.848052
96,NMF,0.579186,0.692500,-0.004519,40,72.388810
97,ProdLDA,0.561492,0.780000,-0.122763,20,29.094626
98,BERTopic,0.558357,0.709091,-0.079055,11,24.371935


## PEC

In [91]:
ctm_pec_model = CTM(**get_hyperparams(ctm_pec_df))
etm_pec_model = ETM(**get_hyperparams(etm_pec_df.drop(columns=['num_neurons', 'rho'])))
hdp_pec_model = HDP(**get_hyperparams(hdp_pec_df))
lda_pec_model = LDA(**get_hyperparams(lda_pec_df))
lsi_pec_model = LSI(**get_hyperparams(lsi_pec_df))
neurallda_pec_model = NeuralLDA(**get_hyperparams(neurallda_pec_df))
nmf_pec_model = NMF(**get_hyperparams(nmf_pec_df))
prodlda_pec_model = ProdLDA(**get_hyperparams(prodlda_pec_df))
bertopic_pec_model = CustomBERTopic()
top2vec_pec_model = CustomTop2Vec()

pec_models = [ctm_pec_model, etm_pec_model, hdp_pec_model, lda_pec_model, lsi_pec_model, neurallda_pec_model, nmf_pec_model, prodlda_pec_model, bertopic_pec_model, top2vec_pec_model]

In [92]:
%%capture

df_test_pec = pd.DataFrame(
    {"Model": [], "TDCI": [], "TD": [], "TC": [], "Topics": [], "Wall time": []})

for i in range(10):
    print("=============")
    print("Iteration" +  str(i))
    print("=============")
    
    for model, name in zip(pec_models, model_names):
        res = eval_model(model, dataset_pec, tdci_pec_metric, td_pec_metric, tc_pec_metric)

        df_test_pec.loc[len(df_test_pec)] = [name, res['tdci'], res['diversity'], res['coherence'], res['topics'], res['wall_time']]

2023-07-03 15:54:47,248 - top2vec - INFO - Pre-processing documents for training
2023-07-03 15:54:47,266 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 15:54:48,560 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-03 15:54:52,042 - top2vec - INFO - Finding dense areas of documents
2023-07-03 15:54:52,059 - top2vec - INFO - Finding topics
2023-07-03 15:59:25,510 - top2vec - INFO - Pre-processing documents for training
2023-07-03 15:59:25,530 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 15:59:27,139 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-03 15:59:29,953 - top2vec - INFO - Finding dense areas of documents
2023-07-03 15:59:29,968 - top2vec - INFO - Finding topics
2023-07-03 16:04:38,657 - top2vec - INFO - Pre-processing documents for training
2023-07-03 16:04:38,675 - top2vec - INFO - Creating joint document/word embedding
2023-07-03 16:04:41,050 - top2vec - INFO - Creating lower dim

In [93]:
# Group by model name
df_test_pec

Unnamed: 0,Model,TDCI,TD,TC,Topics,Wall time
0,CTM,0.515293,0.826667,-0.251388,30,48.723111
1,ETM,0.425262,0.480000,-0.236538,10,94.765039
2,HDP,0.409201,0.612667,-0.385629,150,1.992559
3,LDA,0.621328,0.865000,-0.030464,20,16.790795
4,LSI,0.488022,0.530000,-0.095590,10,4.549821
...,...,...,...,...,...,...
95,NeuralLDA,0.571878,0.878000,-0.151931,50,33.074673
96,NMF,0.570585,0.740000,-0.071419,10,15.294396
97,ProdLDA,0.649306,0.932000,-0.003612,50,30.208863
98,BERTopic,0.584720,0.950000,-0.155338,2,24.127105


In [94]:
df_test_pl.to_csv("../output/df_test_pl.csv")
df_test_pec.to_csv("../output/df_test_pec.csv")

In [95]:
df_test_pl.groupby("Model").mean().sort_values(by="TDCI", ascending=False)

Unnamed: 0_level_0,TDCI,TD,TC,Topics,Wall time
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LDA,0.624661,0.7955,0.029507,20.0,82.508446
NMF,0.579482,0.689,0.000329,40.0,72.559644
ProdLDA,0.565963,0.812,-0.129953,20.0,22.456861
BERTopic,0.559753,0.712183,-0.076876,13.0,23.522111
CTM,0.554907,0.838667,-0.1702,30.0,53.036924
NeuralLDA,0.535344,0.8254,-0.206994,50.0,16.43259
LSI,0.468432,0.446,-0.010379,10.0,4.647132
Top2Vec,0.433561,0.421667,0.010385,2.2,4.072674
ETM,0.418992,0.598,-0.352648,10.0,43.880195
HDP,0.363204,0.673933,-0.502804,150.0,2.425779


In [96]:
df_test_pec.groupby("Model").mean().sort_values(by="TDCI", ascending=False)

Unnamed: 0_level_0,TDCI,TD,TC,Topics,Wall time
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ProdLDA,0.64073,0.9176,-0.015382,50.0,32.006001
LDA,0.601382,0.863,-0.075656,20.0,16.866525
NMF,0.584596,0.75,-0.041169,10.0,15.026641
BERTopic,0.571577,0.95,-0.182378,2.0,26.563039
NeuralLDA,0.556752,0.8716,-0.181828,50.0,28.059425
CTM,0.53798,0.791333,-0.184208,30.0,85.915711
LSI,0.482549,0.5,-0.06547,10.0,4.514212
ETM,0.424168,0.56,-0.309805,10.0,117.875901
HDP,0.402751,0.6048,-0.396197,150.0,2.097474
Top2Vec,0.0,0.0,0.0,0.0,0.0
