In [7]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from bertopic.representation import PartOfSpeech,MaximalMarginalRelevance,KeyBERTInspired

import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import joblib
from typing import List, Dict
import re 

def clean_text(text):
    """Normalize a raw document string for topic modelling.

    Each of the following is replaced by a single space, in this order:
      1. a literal backslash followed by one ASCII letter (e.g. the two
         characters ``\\n`` left over from escaped text),
      2. any whitespace-delimited token containing ``@`` (email addresses),
      3. every character that is neither a word character nor whitespace
         (underscores count as word characters and are kept).
    Runs of whitespace are then collapsed and the ends are trimmed.

    Args:
        text (str): the document to clean.

    Returns:
        str: the cleaned, single-space-separated text.
    """
    # Order matters: emails must be stripped before punctuation removal
    # would break them apart at '@' and '.'.
    patterns = (
        r'\\[a-zA-Z]',  # escape sequences
        r'\S+@\S+',     # email addresses
        r'[^\w\s]',     # punctuation
    )
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)

    # split() with no arguments drops leading/trailing/repeated whitespace.
    return ' '.join(text.split())

def clean_list(listofstrings):
    """Apply ``clean_text`` to every string in an iterable.

    Args:
        listofstrings: iterable of document strings.

    Returns:
        list: a new list with the cleaned strings, in the input order.
    """
    return [clean_text(raw) for raw in listofstrings]

def load_model(model: int):
    """Build a fresh BERTopic instance for the requested representation variant.

    Args:
        model (int): variant selector.
            1 -> BERTopic with its default representation,
            2 -> ``KeyBERTInspired`` representation,
            3 -> ``PartOfSpeech("en_core_web_sm")`` representation,
            any other value -> ``MaximalMarginalRelevance(diversity=0.3)``.

    Returns:
        BERTopic: a newly constructed model; fitting is left to the caller.
    """
    # The baseline takes no representation model at all, so handle it first.
    if model == 1:
        return BERTopic()

    if model == 2:
        representation = KeyBERTInspired()
    elif model == 3:
        representation = PartOfSpeech("en_core_web_sm")
    else:
        # Fallback branch: also reached for values outside 1-4.
        representation = MaximalMarginalRelevance(diversity=0.3)

    return BERTopic(representation_model=representation)
    
def find_coherence(docs,model_num,topk):
    """Fit a BERTopic variant on ``docs`` and return its c_v topic coherence.

    The documents assigned to each topic are concatenated into one text per
    topic, tokenized with the fitted model's vectorizer analyzer, and passed
    to gensim's ``CoherenceModel`` together with the top ``topk`` words of
    every non-outlier topic.

    Args:
        docs: sequence of document strings to fit the topic model on.
        model_num (int): variant selector forwarded to ``load_model``.
        topk (int): number of top words per topic used for scoring.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` with ``coherence='c_v'``.
    """
    topic_model = load_model(model_num)
    topics, _probabilities = topic_model.fit_transform(docs)

    # Merge all documents of a topic into one string per topic.
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    # NOTE(review): _preprocess_text is a private BERTopic method; it may
    # change between releases.
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenize with the same analyzer the fitted model's vectorizer uses.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Score every real topic. BERTopic labels outliers as topic -1; the
    # previous range(len(set(topics)) - 1) assumed that label always occurs
    # and therefore dropped the last topic whenever no outliers were found.
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic)[:topk]]
                   for topic in topic_ids]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v') # {'u_mass', 'c_v', 'c_uci', 'c_npmi'}
    return coherence_model.get_coherence()

In [23]:
from sklearn.datasets import fetch_20newsgroups
# 20 Newsgroups: both splits, with headers, footers and quoted replies removed.
ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data
# Local CSV corpora: only the "text" column is used, as a plain Python list.
nyt = pd.read_csv("data/nyt2020.csv")["text"].tolist()
wiki = pd.read_csv("data/wiki_en_10000.csv")["text"].tolist()

In [24]:
# Top-5 coherence for each model variant on the three corpora.
# Every find_coherence call fits a new model, so this cell is expensive.
for topic_model in range(1, 5):
    ng20_score = find_coherence(ng20, topic_model, 5)
    nyt_score = find_coherence(nyt, topic_model, 5)
    wiki_score = find_coherence(wiki, topic_model, 5)
    print(f"""
            20NG : Model {topic_model} : top k = 5 : {ng20_score}
            NYT : Model {topic_model} : top k = 5 : {nyt_score}
            WIKI : Model {topic_model} : top k = 5 : {wiki_score}
            
          """)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 1 : top k = 5 : 0.7768279863456552
            NYT : Model 1 : top k = 5 : 0.8114295031803567
            WIKI : Model 1 : top k = 5 : 0.7459136392437462
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 2 : top k = 5 : 0.7440544190477366
            NYT : Model 2 : top k = 5 : 0.7554800495727514
            WIKI : Model 2 : top k = 5 : 0.8083850873548543
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 3 : top k = 5 : 0.7714149429864205
            NYT : Model 3 : top k = 5 : 0.7550870818948257
            WIKI : Model 3 : top k = 5 : 0.7711814762458248
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 4 : top k = 5 : 0.7533958626476847
            NYT : Model 4 : top k = 5 : 0.7814181494925228
            WIKI : Model 4 : top k = 5 : 0.7342804697307697
            
          


In [25]:
# Top-10 coherence for each model variant on the three corpora.
# Every find_coherence call fits a new model, so this cell is expensive.
for topic_model in range(1, 5):
    ng20_score = find_coherence(ng20, topic_model, 10)
    nyt_score = find_coherence(nyt, topic_model, 10)
    wiki_score = find_coherence(wiki, topic_model, 10)
    print(f"""
            20NG : Model {topic_model} : top k = 10 : {ng20_score}
            NYT : Model {topic_model} : top k = 10 : {nyt_score}
            WIKI : Model {topic_model} : top k = 10 : {wiki_score}
            
          """)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 1 : top k = 10 : 0.6927072464610986
            NYT : Model 1 : top k = 10 : 0.7282561161173324
            WIKI : Model 1 : top k = 10 : 0.6848766561380102
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 2 : top k = 10 : 0.6379015372049903
            NYT : Model 2 : top k = 10 : 0.6061954749313321
            WIKI : Model 2 : top k = 10 : 0.7356807702152653
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 3 : top k = 10 : 0.6685999832769977
            NYT : Model 3 : top k = 10 : 0.6328978955690443
            WIKI : Model 3 : top k = 10 : 0.7041345626787853
            
          


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


            20NG : Model 4 : top k = 10 : 0.5933241886374133
            NYT : Model 4 : top k = 10 : 0.6184506617364761
            WIKI : Model 4 : top k = 10 : 0.646836403574678
            
          
