In [8]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from models.bertopic.utils.data_loader import DataLoader
from sklearn.feature_extraction.text import CountVectorizer

from spacy.lang.el.stop_words import STOP_WORDS as el_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, KLDivergence
from octis.evaluation_metrics.similarity_metrics import RBO, PairwiseJaccardSimilarity
from octis.evaluation_metrics.topic_significance_metrics import KL_uniform

from models.bertopic.utils.bertopic_evaluator import BERTopicModelEvaluator

## Loading data, embeddings, and sentence transformers

In [12]:
# TODO: Add embeddings for multilingual sentence transformer

In [2]:
# Read the speeches and statements CSV files through the project DataLoader
# and run its processing step before asking for the training split.
loader = DataLoader('data/data_speeches.csv', 'data/data_statements.csv')
loader.process()

train_docs, train_sentences = loader.get_train_data()

# Reference corpora for the coherence metrics.
# OCTIS `Coherence` (a wrapper around gensim's CoherenceModel) expects `texts`
# to be a list of tokenised documents, i.e. one list of tokens per document.
# Wrapping each whole document in a one-element list would turn the full text
# into a single "token", so no topic word could be matched against the corpus.
# The default CountVectorizer analyzer lower-cases the text and splits it into
# unigram tokens using scikit-learn's default token pattern.
# NOTE(review): assumes train_docs / train_sentences are sequences of str
# (they are also passed to BERTopic as raw documents) - confirm against DataLoader.
# NOTE(review): bigram topic terms produced by an ngram_range=(1, 2) vectorizer
# will not appear in this unigram corpus - verify how the evaluator handles them.
coherence_tokenizer = CountVectorizer().build_analyzer()

docs_corpus = [coherence_tokenizer(doc) for doc in train_docs]
sentences_corpus = [coherence_tokenizer(sentence) for sentence in train_sentences]

# embeddings_gr = loader.load_embeddings("models/bertopic/data/checkpoint/embeddings/embeddings_gr.pkl")
# embeddings_gr_media = loader.load_embeddings("models/bertopic/data/checkpoint/embeddings/embeddings_gr_media.pkl")
# embeddings_gr_sentences = loader.load_embeddings("models/bertopic/data/checkpoint/embeddings/embeddings_gr_sentences.pkl")
# embeddings_gr_media_sentences = loader.load_embeddings("models/bertopic/data/checkpoint/embeddings/embeddings_gr_media_sentences.pkl")

# st_greek = SentenceTransformer('lighteternal/stsb-xlm-r-greek-transfer')
# st_greek_media = SentenceTransformer('dimitriz/st-greek-media-bert-base-uncased')

## Vectorizer

In [4]:
# Combined Greek + English stop words, de-duplicated through a set.
stopwords = list({*el_stop, *en_stop})

# Bag-of-words vectorizer that counts unigrams and bigrams and drops the
# combined stop words.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=stopwords,
)

## Evaluation metrics

In [None]:
# One set of coherence scorers per reference corpus ('docs' / 'sentences').
# Each scorer looks at the top 5 words per topic and runs in a single process;
# only the coherence measure differs between entries.
coherence_metrics = {
    corpus_name: {
        metric_name: Coherence(texts=corpus, topk=5, processes=1, measure=measure)
        for metric_name, measure in (
            ("coherence_npmi", 'c_npmi'),
            ("coherence_cv", 'c_v'),
            ("coherence_umass", 'u_mass'),
            ("coherence_uci", 'c_uci'),
        )
    }
    for corpus_name, corpus in (
        ('docs', docs_corpus),
        ('sentences', sentences_corpus),
    )
}

# Metrics that are constructed without a reference corpus.
other_metrics = dict(
    diversity_topic=TopicDiversity(topk=5),
    diversity_kl=KLDivergence(),
    similarity_rbo=RBO(topk=5),
    similarity_pjs=PairwiseJaccardSimilarity(),
    significance_kluni=KL_uniform(),
)

# Single registry handed to the evaluator.
all_metrics = dict(
    coherence_metrics=coherence_metrics,
    other_metrics=other_metrics,
)

## BERTopic model initialization

In [15]:
# One BERTopic model per (embedding model, text granularity) combination.
# Every model uses the same vectorizer settings and is asked to reduce its
# topics to 30 (`nr_topics=30`).
#
# NOTE(review): `st_models` is not defined in any visible cell. It must map
# 'gr_stsb', 'gr_media' and 'multilingual' to embedding models before this
# cell runs, otherwise a fresh "Restart & Run All" fails here.
# NOTE(review): all six models share one CountVectorizer instance; if BERTopic
# fits it in place, consider giving each model its own copy - confirm.

# Greek STS-B transfer model, document level.
model_gr_docs = BERTopic(embedding_model=st_models['gr_stsb'],
                         vectorizer_model=vectorizer_model,
                         nr_topics=30)

# Greek media BERT model, document level.
model_gr_media_docs = BERTopic(embedding_model=st_models['gr_media'],
                               vectorizer_model=vectorizer_model,
                               nr_topics=30)

# Greek STS-B transfer model, sentence level.
# Fixed: the original looked up `st_model[...]` (missing "s"), which raises
# NameError; every other model reads from `st_models`.
model_gr_sentences = BERTopic(embedding_model=st_models['gr_stsb'],
                              vectorizer_model=vectorizer_model,
                              nr_topics=30)

# Greek media BERT model, sentence level.
model_gr_media_sentences = BERTopic(embedding_model=st_models['gr_media'],
                                    vectorizer_model=vectorizer_model,
                                    nr_topics=30)

# Multilingual model, document level.
model_multilingual_docs = BERTopic(embedding_model=st_models['multilingual'],
                                   vectorizer_model=vectorizer_model,
                                   nr_topics=30)

# Multilingual model, sentence level.
model_multilingual_sentences = BERTopic(embedding_model=st_models['multilingual'],
                                        vectorizer_model=vectorizer_model,
                                        nr_topics=30)

In [None]:
# Registry of the BERTopic models to evaluate, keyed by
# "<embedding>_<granularity>".
models = dict(
    gr_docs=model_gr_docs,
    gr_media_docs=model_gr_media_docs,
    gr_sentences=model_gr_sentences,
    gr_media_sentences=model_gr_media_sentences,
    multilingual_docs=model_multilingual_docs,
    multilingual_sentences=model_multilingual_sentences,
)

In [None]:
# Training texts at both granularities, keyed the same way as the
# coherence metric groups ('docs' / 'sentences').
datasets = dict(
    docs=train_docs,
    sentences=train_sentences,
)

In [None]:
# Build the project evaluator from the three registries
# (positional order: models, metrics, datasets).
evaluator = BERTopicModelEvaluator(
    models,
    all_metrics,
    datasets,
)