In [1]:
# import necessary libraries
import itertools
import pandas as pd
import numpy as np
import ast
from gensim import corpora
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
def force_format(texts):
    return [str(t) for t in texts]

In [3]:
def compute_word_occurences(texts):
    words = itertools.chain.from_iterable(texts)
    word_count = pd.Series(words).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})
    return word_count

In [4]:
def get_l_texts(text_file):
    l_texts=[]
    with open(text_file, "r") as f:
        line = f.readlines()
        list_line = [l.strip() for l in line]
        for l in list_line:
            l_texts.append(ast.literal_eval(l))
    return l_texts

In [5]:
dataset = pd.read_json("News_Category_Dataset_v2.json", lines=True, dtype={"headline": str})

In [6]:
texts = force_format(dataset["headline"])

In [7]:
l_texts = get_l_texts("l_texts.txt")

# BERT

In [8]:
#Import
import bertopic


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
representation=bertopic.representation.KeyBERTInspired()
model_trained= bertopic.BERTopic(representation_model=representation)
topics,probs = model_trained.fit_transform(dataset['headline'])
model_trained.visualize_topics()


In [14]:
dictionary = corpora.Dictionary(l_texts)
coherence_model= CoherenceModel(model=model_trained, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score: ", coherence_score)

Coherence Score:  1.0


In [None]:
result=pd.DataFrame(columns=['min_size_topic','Coherence Score',"Number of topics"])
for min_size_topic in range(10, 200, 10):
    model = bertopic.BERTopic(representation_model=representation,min_topic_size=min_size_topic, verbose=True)
    topics,probs = model.fit_transform(dataset['headline'])
    print("min_size_topic =", min_size_topic, "Number of topics :", len(np.unique(topics)))
    coherence_model= CoherenceModel(model=model, texts=l_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print("min_size_topic =", min_size_topic, "Coherence Score: ", coherence_score)
    result=result.append({'min_size_topic':min_size_topic,'Coherence Score':coherence_score,"Number of topics":len(np.unique(topics))},ignore_index=True)

In [20]:
result.head()
result.to_csv("result.csv",index=False)