# OCTIS Model Evaluation

## Prologue & Imports

We will evaluate the performance of the most relevant OCTIS models as a baseline for non-SOTA topic modeling. All models are compared on the same preprocessed dataset, with the same number of topics and the same evaluation metrics.

In [2]:
from octis.models import LSI, NMF, LDA, HDP, NeuralLDA, ProdLDA, ETM
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

from spacy.lang.el.stop_words import STOP_WORDS as el_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from utils.data_loader import GreekPMDataloader
from models.octis.utils.preprocessor_gr import GreekStanzaPreprocessor
from models.octis.config.preprocessing import preprocessor_gr_params
from models.octis.config.models import NUM_TOPICS, lsi_params, nmf_params, lda_params, hdp_params, neural_lda_params, prod_lda_params, etm_params

import pandas as pd

2024-04-03 15:27:25 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-03 15:27:25 INFO: Downloaded file to /Users/dion/stanza_resources/resources.json
2024-04-03 15:27:25 INFO: Loading these models for language: el (Greek):
| Processor | Package                 |
---------------------------------------
| tokenize  | gdt                     |
| mwt       | gdt                     |
| pos       | models/oct..._tagger.pt |
| lemma     | models/oct...matizer.pt |

2024-04-03 15:27:25 INFO: Using device: cpu
2024-04-03 15:27:25 INFO: Loading: tokenize
2024-04-03 15:27:26 INFO: Loading: mwt
2024-04-03 15:27:26 INFO: Loading: pos
2024-04-03 15:27:26 INFO: Loading: lemma
2024-04-03 15:27:26 INFO: Done loading processors!


## Dataset Loading

If our dataset has already been processed and cached, then we can load it. Otherwise, we will preprocess it and save it for future use.

In [4]:
# Load the preprocessed OCTIS dataset from the on-disk cache when it exists;
# otherwise build it from the raw GreekPM CSVs and cache it for the next run.
dataset = Dataset()
try:
    dataset.load_custom_dataset_from_folder('models/octis/data/dataset')
except Exception:
    # Broad on purpose, but no longer bare: KeyboardInterrupt / SystemExit now
    # propagate instead of silently triggering a full re-preprocessing.
    # NOTE(review): OCTIS appears to report a missing or unreadable dataset
    # folder with a generic Exception -- confirm for the installed version.
    print("Dataset not found in cache - loading...")

    speeches_csv = 'data/data_speeches.csv'
    statements_csv = 'data/data_statements.csv'

    # Merge data and prepare for preprocessing
    try:
        speeches_df = pd.read_csv(speeches_csv)
        statements_df = pd.read_csv(statements_csv)
    except FileNotFoundError:
        print("GreekPM data not found - fetching...")
        ds = GreekPMDataloader()  # If the data is not available, download it
        cats_df = ds.load_categories("speeches", "statements")
        print("GreekPM data fetched!")
        # Previously this branch left both frames undefined, so the concat
        # below failed with a NameError. Read the CSVs again after fetching.
        # NOTE(review): assumes the loader writes the per-category CSVs to
        # the two paths above -- confirm against GreekPMDataloader.
        speeches_df = pd.read_csv(speeches_csv)
        statements_df = pd.read_csv(statements_csv)

    merged = pd.concat([speeches_df, statements_df], ignore_index=True)

    # Drop irrelevant columns, discard incomplete rows, then convert the text
    # to string. The cast must come after dropna: casting first turns missing
    # texts into the literal string 'nan', which dropna no longer removes.
    df = (
        merged.drop(columns=['date', 'id', 'url', 'title'])
        .dropna(how='any')
        .assign(text=lambda frame: frame['text'].astype(str))
    )

    df.to_csv('data/data_merged.csv', index=False)

    # We have some non-Greek stopwords in the dataset, so we need to remove them
    stopwords = set(el_stop).union(set(en_stop))

    # Initialize preprocessing
    preprocessor = GreekStanzaPreprocessor(
        stopword_list=stopwords,
        **preprocessor_gr_params,
    )

    # Create the dataset
    print("Preprocessing data...")
    dataset = preprocessor.preprocess_dataset(documents_path='data/data_merged.csv')

    dataset.save('models/octis/data/dataset/')
    print("Dataset preprocessed and saved!")
else:
    print("Dataset found cached - loading...")

Dataset found cached - loading...


## Model Initialization

In [None]:
# Disabled metric setup, kept for reference only. The coherence metrics here
# use topk=10, whereas the active coherence metrics defined further down in
# this notebook use topk=5.
# coherence_npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')
# coherence_cv = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_v')
# coherence_umass = Coherence(texts=dataset.get_corpus(), topk=10, measure='u_mass')
# coherence_uci = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_uci')
# topic_diversity = TopicDiversity(topk=5)

In [6]:
# Only the LSI baseline is instantiated for now. The model class is reached
# through the imported `LSI` name as `LSI.LSI`.
lsi_model = LSI.LSI(
    num_topics=NUM_TOPICS,
    **lsi_params,
)

# Remaining baselines, currently disabled.
# NOTE(review): unlike the LSI call above, these call the imported names
# directly and pass `dataset=` -- confirm the constructor form before enabling.
# lda_model = LDA(num_topics=NUM_TOPICS, dataset=dataset, **lda_params)
# hdp_model = HDP(num_topics=NUM_TOPICS, dataset=dataset, **hdp_params)
# nmf_model = NMF(num_topics=NUM_TOPICS, dataset=dataset, **nmf_params)
# neural_lda_model = NeuralLDA(num_topics=NUM_TOPICS, dataset=dataset, **neural_lda_params)
# prod_lda_model = ProdLDA(num_topics=NUM_TOPICS, dataset=dataset, **prod_lda_params)
# etm_model = ETM(num_topics=NUM_TOPICS, dataset=dataset, **etm_params)

In [None]:
# Fit the LSI baseline on the preprocessed dataset. Nothing else in this
# notebook assigns `output`, so without this call the loop below and the
# scoring cell fail with a NameError.
output = lsi_model.train_model(dataset)

# Show every topic as one space-separated line of its words.
for topic_words in output['topics']:
    print(" ".join(topic_words))

In [None]:
# Build one coherence metric per measure over the same corpus and top-5 words,
# in the order: NPMI, C_V, U_MASS, C_UCI.
npmi, cv, umass, uci = (
    Coherence(texts=dataset.get_corpus(), topk=5, measure=measure_name)
    for measure_name in ('c_npmi', 'c_v', 'u_mass', 'c_uci')
)

In [None]:
# Topic-diversity metric configured with topk=5, the same topk as the
# coherence metrics defined in this notebook.
topic_diversity = TopicDiversity(topk=5)

In [None]:
# Score the model output with each metric in turn, printing every result as
# soon as it is available.
topic_diversity_score = topic_diversity.score(output)
print(f"Topic diversity: {topic_diversity_score!s}")

npmi_score = npmi.score(output)
print(f"NPMI Coherence: {npmi_score!s}")

cv_score = cv.score(output)
print(f"C_V Coherence: {cv_score!s}")

umass_score = umass.score(output)
print(f"U_MASS Coherence: {umass_score!s}")

uci_score = uci.score(output)
print(f"C_UCI Coherence: {uci_score!s}")