# OCTIS Model Evaluation

## Prologue & Imports

We will evaluate the most relevant OCTIS models to establish a baseline of non-SOTA topic-modeling approaches. All models are compared on the same preprocessed dataset, with the same number of topics and the same evaluation metrics.

In [10]:
from octis.models.LSI import LSI
from octis.models.NMF import NMF
from octis.models.LDA import LDA
from octis.models.HDP import HDP
from octis.models.NeuralLDA import NeuralLDA
from octis.models.ProdLDA import ProdLDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, KLDivergence
from octis.evaluation_metrics.similarity_metrics import RBO, PairwiseJaccardSimilarity
from octis.evaluation_metrics.topic_significance_metrics import KL_uniform

from spacy.lang.el.stop_words import STOP_WORDS as el_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from utils.data_loader import GreekPMDataloader
from models.octis.utils.preprocessor_gr import GreekStanzaPreprocessor
from models.octis.config.preprocessing import preprocessor_gr_params
from models.octis.config.models import NUM_TOPICS, lsi_params, nmf_params, lda_params, hdp_params, neural_lda_params, prod_lda_params
from models.octis.config.optimization import OPTIMIZATION_RESULT_PATH, TOP_K, NUM_PROCESSES, MODEL_RUNS, search_space
from models.octis.utils.model_evaluator import OCTISModelEvaluator

import pandas as pd

## Dataset Loading

If our dataset has already been processed and cached, then we can load it. Otherwise, we will preprocess it and save it for future use.

In [11]:
# Load the cached OCTIS dataset if there is one; otherwise build it from the raw
# GreekPM CSVs (downloading them first when they are missing) and cache the result.
try:
    dataset = Dataset()
    dataset.load_custom_dataset_from_folder('models/octis/data/dataset')
    print("Dataset found cached - loading...")
except Exception:
    # `except Exception` instead of a bare `except:` so that a kernel interrupt
    # (KeyboardInterrupt) is not mistaken for a cache miss.
    print("Dataset not found in cache - loading...")
    # Merge data and prepare for preprocessing
    try:
        speeches_df = pd.read_csv('data/data_speeches.csv')
        statements_df = pd.read_csv('data/data_statements.csv')
    except FileNotFoundError:
        print("GreekPM data not found - fetching...")
        ds = GreekPMDataloader() # If the data is not available, download it
        cats_df = ds.load_categories("speeches", "statements")
        print("GreekPM data fetched!")
        # The frames must be (re)bound here, otherwise the concat below fails with a
        # NameError on `speeches_df`.
        # NOTE(review): assumes load_categories() writes data/data_<category>.csv —
        # confirm against GreekPMDataloader; if it does not, this raises a clear
        # FileNotFoundError naming the missing file.
        speeches_df = pd.read_csv('data/data_speeches.csv')
        statements_df = pd.read_csv('data/data_statements.csv')

    df = pd.concat([speeches_df, statements_df], ignore_index=True)

    # Drop irrelevant columns, then incomplete rows, and only then cast to string:
    # casting first would turn missing texts into the literal string "nan", which
    # dropna() no longer recognises as missing.
    df = (
        df.drop(columns=['date', 'id', 'url', 'title'])
          .dropna(how='any')
          .assign(text=lambda frame: frame['text'].astype(str))
    )

    df.to_csv('data/data_merged.csv', index=False)

    # We have some non-Greek stopwords in the dataset, so we need to remove them
    stopwords = set(el_stop).union(set(en_stop))

    # Initialize preprocessing
    preprocessor = GreekStanzaPreprocessor(
                             stopword_list=stopwords,
                             **preprocessor_gr_params)

    # Create the dataset
    print("Preprocessing data...")
    dataset = preprocessor.preprocess_dataset(documents_path='data/data_merged.csv')

    dataset.save('models/octis/data/dataset/')
    print("Dataset preprocessed and saved!")

Dataset not found in cache - loading...
Preprocessing data...


  0%|          | 0/2033 [00:00<?, ?it/s]

2024-04-04 18:07:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
2024-04-04 18:07:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
2024-04-04 18:07:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
2024-04-04 18:07:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
2024-04-04 18:07:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with d

Dataset preprocessed and saved!


In [12]:
# Pull the corpus out of the dataset; it is passed as `texts` to the Coherence metrics below.
corpus = dataset.get_corpus()

## Evaluation Metrics

In [13]:
# The four coherence metrics differ only in their `measure`, so build them in one pass.
coherence_npmi, coherence_cv, coherence_umass, coherence_uci = (
    Coherence(texts=corpus, topk=TOP_K, processes=NUM_PROCESSES, measure=measure)
    for measure in ('c_npmi', 'c_v', 'u_mass', 'c_uci')
)

# Diversity
diversity_topic = TopicDiversity(topk=TOP_K)
diversity_kl = KLDivergence()

# Similarity
similarity_rbo = RBO(topk=TOP_K)
similarity_pjs = PairwiseJaccardSimilarity()

# Significance
significance_kluni = KL_uniform()

# Every metric defined above except `coherence_cv`.
other_metrics = [
    coherence_npmi,
    coherence_umass,
    coherence_uci,
    diversity_topic,
    diversity_kl,
    similarity_rbo,
    similarity_pjs,
    significance_kluni,
]

In [14]:
# Name -> metric instance, in the order the metrics were defined.
metrics = {
    "coherence_npmi": coherence_npmi,
    "coherence_cv": coherence_cv,
    "coherence_umass": coherence_umass,
    "coherence_uci": coherence_uci,
    "diversity_topic": diversity_topic,
    "diversity_kl": diversity_kl,
    "similarity_rbo": similarity_rbo,
    "similarity_pjs": similarity_pjs,
    "significance_kluni": significance_kluni,
}

## Model Initialization

In [15]:
# Instantiate one model per algorithm; each is configured by its own parameter
# dict imported from models.octis.config.models.
lsi_model = LSI(**lsi_params)
lda_model = LDA(**lda_params)
hdp_model = HDP(**hdp_params)
nmf_model = NMF(**nmf_params)
neural_lda_model = NeuralLDA(**neural_lda_params)
prod_lda_model = ProdLDA(**prod_lda_params)

In [16]:
# Name -> model instance; these names are handed to the evaluator together with the models.
models = {
    "lsi": lsi_model,
    "lda": lda_model,
    "hdp": hdp_model,
    "nmf": nmf_model,
    "neural_lda": neural_lda_model,
    "prod_lda": prod_lda_model,
}

## Evaluation

In [17]:
# Bundle the dataset, the models and the metrics into a single evaluator.
evaluator = OCTISModelEvaluator(
    dataset=dataset,
    models=models,
    metrics=metrics,
    topics=NUM_TOPICS,
)

In [18]:
# Run the evaluation. As the last expression of the cell, its return value is rendered below.
# NOTE(review): presumably fits every model in `models` and scores it with every metric
# in `metrics` — confirm in OCTISModelEvaluator.
evaluator.evaluate()

Epoch: [1/200]	Samples: [1439/287800]	Train Loss: 3476.8416109277277	Time: 0:00:00.148420
Epoch: [1/200]	Samples: [160/32000]	Validation Loss: 150187.7669921875	Time: 0:00:00.007389
Epoch: [2/200]	Samples: [2878/287800]	Train Loss: 3374.325817625087	Time: 0:00:00.116625
Epoch: [2/200]	Samples: [160/32000]	Validation Loss: 44377.08774414063	Time: 0:00:00.005676
Epoch: [3/200]	Samples: [4317/287800]	Train Loss: 3315.628974113968	Time: 0:00:00.112942
Epoch: [3/200]	Samples: [160/32000]	Validation Loss: 8872.72900390625	Time: 0:00:00.005109
Epoch: [4/200]	Samples: [5756/287800]	Train Loss: 3276.972148627519	Time: 0:00:00.113632
Epoch: [4/200]	Samples: [160/32000]	Validation Loss: 6392.42294921875	Time: 0:00:00.004949
Epoch: [5/200]	Samples: [7195/287800]	Train Loss: 3245.7185111188323	Time: 0:00:00.114413
Epoch: [5/200]	Samples: [160/32000]	Validation Loss: 3174.653564453125	Time: 0:00:00.005791
Epoch: [6/200]	Samples: [8634/287800]	Train Loss: 3223.8136075399584	Time: 0:00:00.108781
Epoch

  self.evaluation_df = pd.concat([self.evaluation_df, pd.DataFrame(model_metric_data)], ignore_index=True)
  divergence = np.sum(P*np.log(P/Q))
  divergence = np.sum(P*np.log(P/Q))


Unnamed: 0,model,coherence_npmi,coherence_cv,coherence_umass,coherence_uci,diversity_topic,diversity_kl,similarity_rbo,similarity_pjs,significance_kluni
0,lsi,0.032779,0.592699,-1.345714,-0.752395,0.62,0.383475,0.045157,0.031863,0.189856
1,lda,0.144816,0.700904,-1.207785,0.662991,0.793333,2.276325,0.014908,0.011208,1.599033
2,hdp,-0.034733,0.518642,-2.05837,-2.333349,0.568,0.358759,0.015574,0.013512,0.213824
3,nmf,0.113603,0.649522,-1.623093,-0.148922,0.56,3.95566,0.042739,0.035983,2.014337
4,neural_lda,0.013674,0.552173,-1.467874,-0.620283,0.98,1.338393,0.00155,0.001089,0.835836
5,prod_lda,-0.104389,0.542047,-2.846263,-4.63299,0.94,,0.003256,0.003846,
