# Download the repository from https://github.com/AdhyaSuman/CTM-Neg, then run its setup file to install OCTIS

In [None]:
cd ..

Install the package in editable mode by running the setup.py file through pip (`pip install -e .` targets the current directory):

In [None]:
!pip install -e.

# Imports

In [None]:
from octis.dataset.dataset import Dataset

#Import models:
from octis.models.LDA import LDA
from octis.models.ETM import ETM
from octis.models.CTM import CTM
from octis.models.CTMN import CTMN

#Import coherence metrics:
from octis.evaluation_metrics.coherence_metrics import *

#Import TD metrics:
from octis.evaluation_metrics.diversity_metrics import *

import random, torch

# Utils

In [None]:
# Folder that holds the custom (locally stored) datasets, one sub-folder each.
data_dir = './preprocessed_datasets'


def get_dataset(dataset_name):
    """Return an OCTIS ``Dataset`` populated for the given short name.

    Args:
        dataset_name: '20NG' (obtained through ``Dataset.fetch_dataset``),
            or 'GN' / 'M10' (loaded from the matching sub-folder of
            ``data_dir``).

    Returns:
        The loaded ``Dataset`` instance.

    Raises:
        Exception: If ``dataset_name`` is none of the supported names.
    """
    corpus = Dataset()

    if dataset_name == '20NG':
        corpus.fetch_dataset("20NewsGroup")
        return corpus

    # Custom datasets live in a sub-folder named after the dataset.
    for known_name, subfolder in (('GN', "/GN"), ('M10', '/M10')):
        if dataset_name == known_name:
            corpus.load_custom_dataset_from_folder(data_dir + subfolder)
            return corpus

    raise Exception('Missing Dataset name...!!!')


def get_model(model_name, num_topics, dataset_name, use_partitions=False, epochs=50):
    """Build an untrained topic model selected by name.

    Args:
        model_name: One of 'ETM', 'CTM_combined', 'ProdLDA', 'NeuralLDA',
            'LDA' or 'NegCTM'.
        num_topics: Number of topics handed to the model constructor.
        dataset_name: Dataset identifier. Only used as the suffix of the
            ``bert_path`` passed to the contextualized models
            ('CTM_combined' and 'NegCTM').
        use_partitions: Forwarded unchanged to the model constructor.
        epochs: Passed as ``num_epochs`` (as ``iterations`` for 'LDA').

    Returns:
        The constructed model instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'ETM':
        return ETM(num_topics=num_topics, num_epochs=epochs,
                   train_embeddings=True, use_partitions=use_partitions)

    if model_name in ('CTM_combined', 'NegCTM'):
        # Both contextualized models share the same embedding configuration.
        contextual_kwargs = dict(
            num_topics=num_topics,
            num_epochs=epochs,
            bert_path='./bert/ctm_combined/paraphrase-distilroberta-base-v2_' + dataset_name,
            bert_model='paraphrase-distilroberta-base-v2',
            learn_priors=True,
            inference_type="combined",
            use_partitions=use_partitions,
        )
        if model_name == 'CTM_combined':
            return CTM(**contextual_kwargs)
        return CTMN(hidden_sizes=(100, 100), dropout=0.0, **contextual_kwargs)

    if model_name == 'ProdLDA':
        # Not imported at the top of the notebook; import here so this branch
        # does not fail with NameError.
        from octis.models.ProdLDA import ProdLDA
        return ProdLDA(num_topics=num_topics, use_partitions=use_partitions,
                       num_epochs=epochs)

    if model_name == 'NeuralLDA':
        # Same as above: the notebook's import cell does not provide NeuralLDA.
        from octis.models.NeuralLDA import NeuralLDA
        return NeuralLDA(num_topics=num_topics, use_partitions=use_partitions,
                         num_epochs=epochs)

    if model_name == 'LDA':
        return LDA(num_topics=num_topics, use_partitions=use_partitions,
                   iterations=epochs)

    # ValueError is a subclass of Exception, so callers that catch Exception
    # keep working.
    raise ValueError('Model {} is not available'.format(model_name))

# Run

In [None]:
import os

from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real, Categorical, Integer

# Hyperparameters explored by the optimizer. The keys are passed through
# OCTIS to the model; presumably they are CTMN hyperparameters -- verify
# against the CTMN implementation.
# A list (not a set) keeps the order of the categories well defined.
search_space = {
    "topic_perturb": Categorical([1, 2, 3]),
    "tloss_weight": Real(0.0, 1.0),
}
optimization_runs = 50  # number of optimizer calls per configuration
model_runs = 5          # model trainings per evaluated hyperparameter setting

n_topics = [10, 20, 30, 40, 50, 60, 90, 120]
models = ['NegCTM']
datasets = ['20NG', 'GN', 'M10']

# All result files of this sweep go here; create the folder up front so that
# writing the CSV cannot fail because the directory is missing.
save_path = './results/Hyp_optimized/'
os.makedirs(save_path, exist_ok=True)

# Extra metrics that do not depend on the dataset are built once.
# `td` was previously referenced without being defined (NameError).
td = TopicDiversity(topk=10)
irbo = InvertedRBO(topk=10)

for d in datasets:
    data = get_dataset(d)

    # The coherence metrics are computed against the dataset's own corpus.
    test = data.get_corpus()

    eval_metric = Coherence(texts=test, measure='c_npmi', topk=10)  # optimized metric
    cv = Coherence(texts=test, measure='c_v', topk=10)

    for m in models:
        for k in n_topics:
            model = get_model(model_name=m, num_topics=k, dataset_name=d)

            optimizer = Optimizer()

            save_name = "results_{}_{}_topics_{}".format(m, d, k)
            optimization_result = optimizer.optimize(
                model, data, eval_metric, search_space,
                number_of_call=optimization_runs,
                model_runs=model_runs, save_models=False,
                extra_metrics=[cv, td, irbo],  # to keep track of other metrics
                save_name=save_name,
                save_path=save_path)

            optimization_result.save_to_csv(save_path + save_name + ".csv")