# Download the repo from https://github.com/AdhyaSuman/CTMNeg and run the setup file to install OCTIS

In [None]:
cd ..

Run the setup.py file using pip

In [None]:
!pip install -e.

In [None]:
#Import models:
from octis.models.LDA import LDA
from octis.models.ETM import ETM
from octis.models.CTM import CTM
from octis.models.CTMN import CTMN
from octis.models.ProdLDA import ProdLDA

#Import coherence metrics:
from octis.evaluation_metrics.coherence_metrics import *

#Import TD metrics:
from octis.evaluation_metrics.diversity_metrics import *

import random, torch

# Utils

In [None]:
# Root folder of the locally stored corpora; get_dataset appends "/GN" or
# "/M10" to this path when loading a custom dataset.
data_dir = './preprocessed_datasets'


def get_dataset(dataset_name):
    """Return a populated ``Dataset`` for the given short name.

    ``'20NG'`` is obtained through ``fetch_dataset("20NewsGroup")``;
    ``'GN'`` and ``'M10'`` are read from the matching sub-folder of
    ``data_dir``. Any other name raises ``Exception``.
    """
    corpus = Dataset()

    if dataset_name == '20NG':
        corpus.fetch_dataset("20NewsGroup")
        return corpus

    # Custom corpora live in a folder named exactly like their short name.
    for folder in ('GN', 'M10'):
        if dataset_name == folder:
            corpus.load_custom_dataset_from_folder(data_dir + '/' + folder)
            return corpus

    raise Exception('Missing Dataset name...!!!')


def get_model(model_name, num_topics, dataset_name, use_partitions=False, epochs=50,
              sbert='paraphrase-distilroberta-base-v2'):
    """Build an untrained topic model selected by name.

    Args:
        model_name: One of 'ETM', 'CTM_combined', 'ProdLDA', 'NeuralLDA',
            'LDA' or 'NegCTM'.
        num_topics: Number of topics passed to the model constructor.
        dataset_name: Dataset identifier; only used to build the
            ``bert_path`` of the contextual models ('CTM_combined', 'NegCTM').
        use_partitions: Forwarded unchanged to the model constructor.
        epochs: Passed as ``num_epochs`` (as ``iterations`` for 'LDA').
        sbert: Passed as ``bert_model`` to the contextual models and
            embedded in their ``bert_path``.

    Returns:
        The constructed model instance.

    Raises:
        Exception: If ``model_name`` is not one of the supported names.
    """
    # Path handed to the contextual models as bert_path; it is keyed by
    # model, dataset, SBERT name and the partition flag.
    bert_path = './bert/{}_{}_{}_partition{}'.format(
        model_name, dataset_name, sbert, use_partitions)

    if model_name == 'ETM':
        model = ETM(num_topics=num_topics, num_epochs=epochs,
                    train_embeddings=True, use_partitions=use_partitions)

    elif model_name == 'CTM_combined':
        model = CTM(num_topics=num_topics, num_epochs=epochs,
                    bert_path=bert_path,
                    bert_model=sbert,
                    learn_priors=True,
                    inference_type="combined", use_partitions=use_partitions)

    elif model_name == 'ProdLDA':
        model = ProdLDA(num_topics=num_topics, use_partitions=use_partitions, num_epochs=epochs)

    elif model_name == 'NeuralLDA':
        # NeuralLDA is not among the notebook's top-level imports, so import
        # it here; otherwise this branch fails with NameError.
        from octis.models.NeuralLDA import NeuralLDA
        model = NeuralLDA(num_topics=num_topics, use_partitions=use_partitions, num_epochs=epochs)

    elif model_name == 'LDA':
        model = LDA(num_topics=num_topics, use_partitions=use_partitions, iterations=epochs)

    elif model_name == 'NegCTM':
        model = CTMN(num_topics=num_topics, hidden_sizes=(100, 100),
                     dropout=0.0,
                     num_epochs=epochs,
                     bert_path=bert_path,
                     bert_model=sbert,
                     learn_priors=True,
                     inference_type="combined", use_partitions=use_partitions)
    else:
        raise Exception('Model {} is not avilable'.format(model_name))

    return model

# Run

In [None]:
from random import seed, randint

# Five random integer seeds in [0, 2000]. The upper bound must be an int:
# a float such as 2e3 is rejected by randint on recent Python versions.
seeds = [randint(0, 2000) for _ in range(5)]
n_topics = [10, 20, 30, 40, 50, 60, 90, 120]
models = ['CTM_combined', 'ProdLDA', 'ETM', 'LDA']
datasets = ['20NG', 'GN', 'M10']

# One list per output column; every run appends exactly one value to each.
results = {
    'Dataset': [],
    'Seed': [],
    'K': [],
    'Model':[],
    'NPMI': [],
    'CV': [],
    'IRBO': []
}

irbo = InvertedRBO(topk=10)

# Load every corpus once up front instead of once per (model, K, seed) run.
# This also surfaces a missing dataset before any training starts.
# NOTE(review): assumes train_model does not modify the dataset object.
loaded_datasets = {name: get_dataset(name) for name in datasets}

for m in models:
    for k in n_topics:
        # Named run_seed so the loop does not overwrite the imported seed().
        for run_seed in seeds:
            for d in datasets:
                data = loaded_datasets[d]

                print("-"*100)
                print('Dataset:{}, Model:{}, K={}, seed={}'.format(d, m, k, run_seed))
                print("-"*100)

                random.seed(run_seed)
                torch.random.manual_seed(run_seed)

                model = get_model(model_name=m,
                                  num_topics=k,
                                  dataset_name=d,
                                  use_partitions=False,
                                  epochs=50
                                 )
                hyp = None
                if m=='LDA':
                    hyp = {'passes':5, 'random_state':run_seed}

                output = model.train_model(data, hyp)

                # Drop the trained model before scoring to release memory.
                del model
                torch.cuda.empty_cache()

                #Coherence Scores:
                npmi = Coherence(texts=data.get_corpus(), measure='c_npmi')
                npmi_score = npmi.score(output)
                del npmi

                cv = Coherence(texts=data.get_corpus(), measure='c_v')
                cv_score = cv.score(output)
                del cv

                #Topic Diversity:
                irbo_score = irbo.score(output)

                # Append the complete row only after every metric succeeded,
                # so all columns keep the same length even if a run fails or
                # the cell is interrupted.
                results['Dataset'].append(d)
                results['Model'].append(m)
                results['K'].append(k)
                results['Seed'].append(run_seed)
                results['NPMI'].append(npmi_score)
                results['CV'].append(cv_score)
                results['IRBO'].append(irbo_score)
                print(results)

# Save results as CSV

In [None]:
import os

import pandas as pd

# to_csv does not create missing parent folders; create the output directory
# first so the finished experiment results are not lost to an OSError.
os.makedirs('./results', exist_ok=True)

# Each key of `results` becomes a column; the row index is not written.
Results = pd.DataFrame.from_dict(results)
Results.to_csv('./results/comparision.csv', index=False)