# OCTIS Model Hyperparameter Optimization

In this notebook, we select the best hyperparameters for the OCTIS models that we will use in our final report. The optimization is done with OCTIS's built-in optimizer, through a custom wrapper written for the purpose of this notebook.

## Imports & Setup

In [None]:
from utils.model_optimizer import OCTISModelOptimizer
from config.optimization import OPTIMIZATION_RESULT_PATH, TOP_K, NUM_PROCESSES, MODEL_RUNS, search_space

from octis.models.LSI import LSI
from octis.models.NMF import NMF
from octis.models.LDA import LDA
from octis.models.HDP import HDP
from octis.models.NeuralLDA import NeuralLDA
from octis.models.ProdLDA import ProdLDA

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.similarity_metrics import RBO, PairwiseJaccardSimilarity

from octis.dataset.dataset import Dataset

In [None]:
# Load the preprocessed dataset from 'data/dataset' and pull out its corpus.
# `corpus` is only bound when loading succeeds, so any later code that uses it
# will fail with a NameError if the failure message below is printed.
try:
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder('data/dataset')
    corpus = dataset.get_corpus()
except Exception as err:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed; still best-effort (no re-raise).
    print("Dataset not found. Please load the dataset first.")
    # Surface the underlying cause instead of hiding it.
    print("Reason:", repr(err))
else:
    # Success path kept out of the `try` body so only the loading calls are guarded.
    print("Dataset found cached - loading...")

### Evaluation Metrics

In [None]:
# Coherence metrics: one instance per measure, all computed on the same corpus
# with the same top-k cut-off and process count.
coherence_npmi = Coherence(texts=corpus, topk=TOP_K, processes=NUM_PROCESSES, measure='c_npmi')
coherence_cv = Coherence(texts=corpus, topk=TOP_K, processes=NUM_PROCESSES, measure='c_v')
coherence_umass = Coherence(texts=corpus, topk=TOP_K, processes=NUM_PROCESSES, measure='u_mass')
coherence_uci = Coherence(texts=corpus, topk=TOP_K, processes=NUM_PROCESSES, measure='c_uci')

# Topic diversity over the top-k words of each topic.
diversity_topic = TopicDiversity(topk=TOP_K)

# Similarity metrics. PairwiseJaccardSimilarity is given TOP_K explicitly so
# that it uses the same cut-off as every other metric instead of the library's
# default value.
similarity_rbo = RBO(topk=TOP_K)
similarity_pjs = PairwiseJaccardSimilarity(topk=TOP_K)

# Metrics tracked in addition to the main one. `coherence_cv` is not listed
# here — presumably because it is supplied separately as the validation metric.
other_metrics = [coherence_npmi, coherence_umass, coherence_uci, diversity_topic, similarity_rbo, similarity_pjs]

### Optimization Parameters

In [None]:
# Keyword arguments shared by every OCTISModelOptimizer created in this
# notebook; they are splatted into each constructor call with `**`.
optimization_params = dict(
    validation_metric=coherence_cv,  # the c_v coherence instance
    other_metrics=other_metrics,     # remaining coherence/diversity/similarity metrics
    topk=TOP_K,
    model_runs=MODEL_RUNS,
)

## Model Initialization

Here we initialize the models that will be tuned in the optimization process.

In [None]:
# Models built with their constructor defaults.
lsi_model, lda_model, hdp_model = LSI(), LDA(), HDP()

# NMF is the only one of these given an explicit constructor option.
nmf_model = NMF(normalize=True)

# Both neural models are created with dataset partitioning switched off.
neural_model_options = {"use_partitions": False}
neural_lda_model = NeuralLDA(**neural_model_options)
prod_lda_model = ProdLDA(**neural_model_options)

## Model Optimization

### Latent Semantic Indexing (LSI)

In [None]:
# Search space for LSI and the number of optimization runs derived from it:
# 15 x (number of search-space entries - 2).
# NOTE(review): the reason for subtracting 2 is not visible here — confirm.
lsi_space = search_space['lsi']
lsi_run_budget = 15 * (len(lsi_space) - 2)

lsi_optimizer = OCTISModelOptimizer(
    model=lsi_model,
    dataset=dataset,
    search_space=lsi_space,
    save_path=OPTIMIZATION_RESULT_PATH + 'lsi/',
    optimization_runs=lsi_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
lsi_optimal_params = lsi_optimizer.optimize()
lsi_optimal_params

### Latent Dirichlet Allocation (LDA)

In [None]:
# Search space for LDA and the number of optimization runs derived from it:
# 15 x (number of search-space entries - 2).
# NOTE(review): the reason for subtracting 2 is not visible here — confirm.
lda_space = search_space["lda"]
lda_run_budget = 15 * (len(lda_space) - 2)

lda_optimizer = OCTISModelOptimizer(
    model=lda_model,
    dataset=dataset,
    search_space=lda_space,
    save_path=OPTIMIZATION_RESULT_PATH + "lda/",
    optimization_runs=lda_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
lda_optimal_params = lda_optimizer.optimize()
lda_optimal_params

### Hierarchical Dirichlet Process (HDP)

In [None]:
# Search space for HDP and the number of optimization runs derived from it:
# 15 x (number of search-space entries - 1).
# NOTE(review): HDP subtracts 1 where the other models in this notebook
# subtract 2 — presumably intentional; confirm against the search-space config.
hdp_space = search_space["hdp"]
hdp_run_budget = 15 * (len(hdp_space) - 1)

hdp_optimizer = OCTISModelOptimizer(
    model=hdp_model,
    dataset=dataset,
    search_space=hdp_space,
    save_path=OPTIMIZATION_RESULT_PATH + "hdp/",
    optimization_runs=hdp_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
hdp_optimal_params = hdp_optimizer.optimize()
hdp_optimal_params

### Non-Negative Matrix Factorization (NMF)

In [None]:
# Search space for NMF and the number of optimization runs derived from it:
# 15 x (number of search-space entries - 2).
# NOTE(review): the reason for subtracting 2 is not visible here — confirm.
nmf_space = search_space["nmf"]
nmf_run_budget = 15 * (len(nmf_space) - 2)

nmf_optimizer = OCTISModelOptimizer(
    model=nmf_model,
    dataset=dataset,
    search_space=nmf_space,
    save_path=OPTIMIZATION_RESULT_PATH + "nmf/",
    optimization_runs=nmf_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
nmf_optimal_params = nmf_optimizer.optimize()
nmf_optimal_params

### Neural LDA 

In [None]:
# Search space for Neural LDA and the number of optimization runs derived from
# it: 15 x (number of search-space entries - 2).
# NOTE(review): the reason for subtracting 2 is not visible here — confirm.
neural_lda_space = search_space["neural_lda"]
neural_lda_run_budget = 15 * (len(neural_lda_space) - 2)

neural_lda_optimizer = OCTISModelOptimizer(
    model=neural_lda_model,
    dataset=dataset,
    search_space=neural_lda_space,
    save_path=OPTIMIZATION_RESULT_PATH + "neural_lda/",
    optimization_runs=neural_lda_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
neural_lda_optimal_params = neural_lda_optimizer.optimize()
neural_lda_optimal_params

### ProdLDA

In [None]:
# Search space for ProdLDA and the number of optimization runs derived from it:
# 15 x (number of search-space entries - 2).
# NOTE(review): the reason for subtracting 2 is not visible here — confirm.
prod_lda_space = search_space["prod_lda"]
prod_lda_run_budget = 15 * (len(prod_lda_space) - 2)

prod_lda_optimizer = OCTISModelOptimizer(
    model=prod_lda_model,
    dataset=dataset,
    search_space=prod_lda_space,
    save_path=OPTIMIZATION_RESULT_PATH + "prod_lda/",
    optimization_runs=prod_lda_run_budget,
    **optimization_params,
)

# Run the optimization and show its result as the cell output.
prod_lda_optimal_params = prod_lda_optimizer.optimize()
prod_lda_optimal_params