# Download the repo from https://github.com/AdhyaSuman/NTMs_Dropout_Analysis and run the setup file to install OCTIS

In [None]:
# Move one directory up -- presumably to the repository root that holds setup.py (verify against your checkout).
cd ..

Install OCTIS in editable mode by running the `setup.py` file through pip:

In [None]:
# Editable ("-e") install of the package located in the current directory.
pip install -e.

# Imports

In [None]:
from octis.dataset.dataset import Dataset

#Import models:
from octis.models.ETM import ETM
from octis.models.CTM import CTM
from octis.models.ProdLDA import ProdLDA

#Import coherence metric:
from octis.evaluation_metrics.coherence_metrics import Coherence

#Import TD metrics:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO

#Import Classification metric:
from octis.evaluation_metrics.classification_metrics import AccuracyScore

import random, torch

# Utils

In [None]:
data_dir = './preprocessed_datasets'

# Datasets that are loaded from a sub-folder of `data_dir` bearing the same name.
_CUSTOM_DATASETS = ('BBC_news', 'Wiki40B', 'AllNews')


def get_dataset(dataset_name):
    """Return the OCTIS ``Dataset`` identified by ``dataset_name``.

    ``'20NG'`` is obtained through ``Dataset.fetch_dataset("20NewsGroup")``.
    ``'BBC_news'``, ``'Wiki40B'`` and ``'AllNews'`` are loaded with
    ``Dataset.load_custom_dataset_from_folder`` from ``data_dir/<dataset_name>``.

    Args:
        dataset_name: One of ``'20NG'``, ``'BBC_news'``, ``'Wiki40B'``,
            ``'AllNews'``.

    Returns:
        The populated ``Dataset`` instance.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    # Validate before constructing anything so a typo fails fast and the
    # error message names the offending value.
    if dataset_name != '20NG' and dataset_name not in _CUSTOM_DATASETS:
        raise ValueError(
            'Unknown dataset name {!r}; expected one of {}'.format(
                dataset_name, ('20NG',) + _CUSTOM_DATASETS))

    data = Dataset()
    if dataset_name == '20NG':
        data.fetch_dataset("20NewsGroup")
    else:
        data.load_custom_dataset_from_folder(data_dir + "/" + dataset_name)
    return data


def get_model(model_name, num_topics, dataset_name, use_partitions, theta_dropout, enc_dropout, num_epochs=30):
    """Instantiate one of the supported topic models.

    Args:
        model_name: ``'ETM'``, ``'CTM_combined'`` or ``'ProdLDA'``.
        num_topics: Forwarded to the model constructor as ``num_topics``.
        dataset_name: Only used for ``'CTM_combined'``, where it selects the
            ``./bert/<dataset_name>/`` directory passed as ``bert_path``.
        use_partitions: Forwarded to the model constructor as
            ``use_partitions``.
        theta_dropout: Forwarded to the model constructor as ``theta_dropout``.
        enc_dropout: Forwarded to the model constructor as ``enc_dropout``.
        num_epochs: Forwarded to the model constructor as ``num_epochs``
            (defaults to 30).

    Returns:
        A newly constructed ``ETM``, ``CTM`` or ``ProdLDA`` instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Keyword arguments that every supported model receives unchanged.
    shared_kwargs = dict(
        num_topics=num_topics,
        num_epochs=num_epochs,
        theta_dropout=theta_dropout,
        enc_dropout=enc_dropout,
        use_partitions=use_partitions,
    )

    if model_name == 'ETM':
        return ETM(train_embeddings=True, **shared_kwargs)

    if model_name == 'CTM_combined':
        return CTM(
            bert_path='./bert/{}/'.format(dataset_name),
            bert_model='paraphrase-distilroberta-base-v2',
            inference_type="combined",
            **shared_kwargs,
        )

    if model_name == 'ProdLDA':
        return ProdLDA(**shared_kwargs)

    raise ValueError(
        "Unknown model name {!r}; expected one of "
        "('ETM', 'CTM_combined', 'ProdLDA')".format(model_name))

# Run Models

In [None]:
import os
# NOTE(review): presumably set to silence the `tokenizers` library's
# parallelism/fork warnings during training -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from itertools import product
from random import seed, randint
from IPython.display import clear_output

# Ten random integer seeds in [0, 2000]. The upper bound is written as an int
# because random.randint raises TypeError for non-integer arguments such as
# 2e3 on Python 3.12+.
seeds = [randint(0, 2000) for _ in range(10)]
n_topics = [20, 50, 100]
models = ['CTM_combined', 'ProdLDA', 'ETM']
datasets = ['20NG', 'BBC_news', 'Wiki40B', 'AllNews']
use_partitions = [True]
theta_dropout = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
enc_dropout = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

# One list per column; every run appends exactly one value to every list so
# that the columns stay aligned.
results = {
    'Dataset': [],
    'Seed': [],
    'K': [],
    'Partition': [],
    'ThetaDrop': [],
    'EncodingDrop': [],
    'Model': [],
    'NPMI': [],
    'TD': [],
    'Accuracy': [],
}

td = TopicDiversity(topk=10)
# NOTE(review): `irbo` is instantiated but never scored in this cell.
irbo = InvertedRBO(topk=10)

# itertools.product varies the right-most sequence fastest, i.e. the same
# visiting order as nesting the loops models > n_topics > seeds >
# use_partitions > enc_dropout > theta_dropout > datasets.
for m, k, run_seed, partition, e_drop, t_drop, d in product(
        models, n_topics, seeds, use_partitions, enc_dropout, theta_dropout, datasets):
    data = get_dataset(d)

    print("-"*100)
    print('Dataset:{}, Model:{}, K={}, seed={}, partition={}, enc_dropout={}, theta_dropout={}'.format(d, m, k, run_seed, partition, e_drop, t_drop))
    print("-"*100)

    # Re-seed Python's and torch's RNGs before building each model.
    random.seed(run_seed)
    torch.random.manual_seed(run_seed)

    model = get_model(model_name=m,
                      num_topics=k,
                      dataset_name=d,
                      use_partitions=partition,
                      enc_dropout=e_drop,
                      theta_dropout=t_drop,
                      )

    output = model.train_model(dataset=data)
    # Drop the model and release cached GPU memory before the next run.
    del model
    torch.cuda.empty_cache()

    # Hyperparams:
    results['Dataset'].append(d)
    results['Model'].append(m)
    results['K'].append(k)
    results['Seed'].append(run_seed)
    results['Partition'].append(partition)
    results['EncodingDrop'].append(e_drop)
    results['ThetaDrop'].append(t_drop)

    # Classification accuracy is only attempted for partitioned runs.
    if partition:
        try:
            accuracy = AccuracyScore(data)
            results['Accuracy'].append(accuracy.score(output))
        except Exception:
            # Best-effort: a failed scoring is recorded as 0.0, as before.
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # stop the sweep instead of being logged as a zero accuracy.
            results['Accuracy'].append(0.0)
    else:
        # Placeholder keeps the 'Accuracy' column aligned with the others.
        results['Accuracy'].append(None)

    # Coherence score (NPMI) computed against the dataset's corpus.
    npmi = Coherence(texts=data.get_corpus(), measure='c_npmi')
    results['NPMI'].append(npmi.score(output))
    del npmi

    # Topic diversity.
    results['TD'].append(td.score(output))
    clear_output(wait=False)


In [None]:
# Show the raw `results` dictionary (one list per column) filled by the experiment loop.
print(results)