In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from models.bertopic.utils.data_loader import DataLoader
from sklearn.feature_extraction.text import CountVectorizer

from spacy.lang.el.stop_words import STOP_WORDS as el_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, KLDivergence
from octis.evaluation_metrics.similarity_metrics import RBO, PairwiseJaccardSimilarity
from octis.evaluation_metrics.topic_significance_metrics import KL_uniform

## Data, embeddings and sentence transformers loading

In [12]:
# TODO: Add embeddings for multilingual sentence transformer

In [22]:
# Build the loader over the two CSV sources and run its processing step
# before asking it for any text.
loader = DataLoader('data/data_speeches.csv', 'data/data_statements.csv')
loader.process()

# The loader exposes the text at two granularities.
docs, sentences = loader.get_docs(), loader.get_sentences()

# Wrap every text in its own single-element list.
# NOTE(review): each inner list holds the *whole* text as one item, not a
# list of tokens -- confirm that downstream consumers expect this shape.
docs_corpus = [[text] for text in docs]
sentences_corpus = [[text] for text in sentences]

# Embeddings restored from checkpoint files: one per (encoder, granularity).
# NOTE(review): the ".pkl" suffix suggests pickle files -- only load
# checkpoints from a trusted source.
embeddings_gr = loader.load_embeddings(
    "models/bertopic/data/checkpoint/embeddings/embeddings_gr.pkl"
)
embeddings_gr_media = loader.load_embeddings(
    "models/bertopic/data/checkpoint/embeddings/embeddings_gr_media.pkl"
)
embeddings_gr_sentences = loader.load_embeddings(
    "models/bertopic/data/checkpoint/embeddings/embeddings_gr_sentences.pkl"
)
embeddings_gr_media_sentences = loader.load_embeddings(
    "models/bertopic/data/checkpoint/embeddings/embeddings_gr_media_sentences.pkl"
)

# Sentence-transformer encoders, referenced by model name.
st_greek = SentenceTransformer('lighteternal/stsb-xlm-r-greek-transfer')
st_greek_media = SentenceTransformer('dimitriz/st-greek-media-bert-base-uncased')

In [21]:
# Peek at the first two sentences returned by the loader.
print(sentences[0:2])

['Κύριε Πρόεδρε κυρίες και κύριοι βουλευτές Προσέρχομαι στη σημερινή συζήτηση θεωρώντας την τραγωδία των Τεμπών ένα ζήτημα πολύ μεγαλύτερο πολύ βαθύτερο από απλή αφορμή για την κατάθεση έστω και μίας πρότασης δυσπιστίας κατά της κυβέρνησης όπως το θέλησαν δηλαδή τα κόμματα που την προκάλεσαν.', 'Γι’ αυτό και θα μιλήσω περισσότερο για την αλήθεια γύρω από αυτό το δυστύχημα το οποίο τόσο πλήγωσε την πατρίδα μας σύμφωνα τουλάχιστον πάντα με τη δική μου οπτική.']


## Vectorizer

In [8]:
# Combined Greek + English stop-word list (duplicates collapse in the set union).
stopwords = list(set(el_stop) | set(en_stop))

# Bag-of-words vectorizer over unigrams and bigrams with the stop words removed.
vectorizer_model = CountVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 2),
)

## Evaluation metrics

In [None]:
# OCTIS' Coherence forwards `texts` to gensim, which expects every document as
# a list of tokens. Wrapping the raw string in a one-element list (as
# `docs_corpus` / `sentences_corpus` do) turns each whole text into a single
# "token", so no topic word can ever be found in the reference corpus.
# Tokenize with the analyzer of the vectorizer handed to the topic models so
# the reference tokens (lower-casing, stop words, 1-2 grams) line up with the
# words that appear in the extracted topics.
tokenize = vectorizer_model.build_analyzer()
docs_tokens = [tokenize(doc) for doc in docs]
sentences_tokens = [tokenize(sentence) for sentence in sentences]

# Metric key -> coherence measure name passed to OCTIS.
coherence_measures = {
    "coherence_npmi": 'c_npmi',
    "coherence_cv": 'c_v',
    "coherence_umass": 'u_mass',
    "coherence_uci": 'c_uci',
}


def _coherence_suite(texts):
    """Build one Coherence metric per measure over the same reference corpus.

    Args:
        texts: Tokenized reference corpus, one list of tokens per text.

    Returns:
        Dict mapping each key of ``coherence_measures`` to a ``Coherence``
        instance configured with ``topk=5`` and ``processes=1``.
    """
    return {
        key: Coherence(texts=texts, topk=5, processes=1, measure=measure)
        for key, measure in coherence_measures.items()
    }


# Coherence needs a reference corpus, so keep one suite per text granularity.
coherence_metrics = {
    'docs': _coherence_suite(docs_tokens),
    'sentences': _coherence_suite(sentences_tokens),
}

# Metrics that are constructed without a reference corpus.
other_metrics = {
    'diversity_topic': TopicDiversity(topk=5),
    'diversity_kl': KLDivergence(),
    'similarity_rbo': RBO(topk=5),
    'similarity_pjs': PairwiseJaccardSimilarity(),
    'significance_kluni': KL_uniform()
}

all_metrics = {
    'coherence_metrics': coherence_metrics,
    'other_metrics': other_metrics}

## BERTopic model initialization

In [None]:
def _fit_topic_model(texts, sentence_model, embeddings, nr_topics=30):
    """Fit a BERTopic model on ``texts`` using precomputed ``embeddings``.

    Args:
        texts: Documents (or sentences) to cluster into topics.
        sentence_model: Encoder to register as the model's ``embedding_model``.
            BERTopic expects the embedding model here, not the vectors; the
            precomputed vectors are supplied to ``fit_transform`` instead.
        embeddings: Precomputed embeddings, one row per entry of ``texts``.
        nr_topics: Number of topics to reduce to after fitting.

    Returns:
        The fitted ``BERTopic`` instance.
    """
    topic_model = BERTopic(
        embedding_model=sentence_model,
        # Each model gets its own unfitted vectorizer with the same settings.
        # BERTopic fits the vectorizer it is given, so sharing one instance
        # would let every later fit overwrite the vocabulary held by the
        # models fitted before it.
        vectorizer_model=CountVectorizer(**vectorizer_model.get_params()),
        nr_topics=nr_topics,
    )
    topic_model.fit_transform(texts, embeddings)
    return topic_model


# NOTE(review): assumes the `embeddings_gr*` checkpoints were produced with
# `st_greek` and the `embeddings_gr_media*` ones with `st_greek_media` --
# confirm against the script that generated them.
# Assigning the helper's result (rather than ending the cell on a bare
# `fit_transform` call) also keeps the per-text topic list out of the output.
model_gr_docs = _fit_topic_model(docs, st_greek, embeddings_gr)
model_gr_media_docs = _fit_topic_model(docs, st_greek_media, embeddings_gr_media)
model_gr_sentences = _fit_topic_model(sentences, st_greek, embeddings_gr_sentences)
model_gr_media_sentences = _fit_topic_model(
    sentences, st_greek_media, embeddings_gr_media_sentences
)

In [None]:
# Registry of the fitted topic models, keyed by "<encoder>_<granularity>".
models = dict(
    gr_docs=model_gr_docs,
    gr_media_docs=model_gr_media_docs,
    gr_sentences=model_gr_sentences,
    gr_media_sentences=model_gr_media_sentences,
)