# Environment

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install bertopic
!pip install datasets
!pip install -U plotly==5.3.1
!pip install -U kaleido
!pip install matplotlib
!pip install -U numpy==1.23.5

In [3]:
import pandas as pd
from bertopic import BERTopic
import os
from os.path import join as opj
import numpy as np
import pandas as pd
import time

# Data

In [4]:
news_df = pd.read_csv("/content/drive/MyDrive/ML_project/new_text_withstem.csv")
# Extract

cleaned_text = news_df['headline_cleaned_text']
original_text = news_df['headline_text']
cleaned_text = [cleaned_text[i] if isinstance(cleaned_text[i],str) else original_text[i] for i in range(len(cleaned_text))]

num_samples = 100000
data = cleaned_text[:num_samples]

# Interface

In [5]:

def pipeline_single_setting(data, output_root, embedding_model,DR_model,clustering_model,vectorizer_model,representation_model,output_n_topics_list = (8,20,50), embeddings = None,reduced_embeddings = None):
  if embeddings is None:
    try:
      embeddings = embedding_model.encode(data, show_progress_bar=True)
    except Exception:
      embeddings = embedding_model.fit_transform(data)

  if reduced_embeddings is None:
    try:
      reduced_embeddings = DR_model.fit_transform(embeddings)
    except:
      print(1)
      transformed_embeddings = np.abs(embeddings)
      print(2)
      reduced_embeddings = DR_model.fit_transform(transformed_embeddings)
      print(3)

  topic_model = BERTopic(
  # Pipeline models
  embedding_model=embedding_model,
  umap_model=DR_model,
  hdbscan_model=clustering_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=30,
  # nr_topics = nr_topics,
  verbose=True
)
  topics, probs = topic_model.fit_transform(data, embeddings)
  topics_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)
  topic_representation_df = topic_model.get_topic_info()


  os.makedirs(output_root, exist_ok = True)
  embeddings_path = opj(output_root, "embeddings.npy")
  np.save(embeddings_path, embeddings)
  reduced_embeddings_path = opj(output_root, "reduced_embeddings.npy")
  np.save(reduced_embeddings_path, reduced_embeddings_path)
  topic_representation_path = opj(output_root, "topic_representations.csv")
  topic_representation_df.to_csv(topic_representation_path, index = False)
  model_path = opj(output_root, "model")
  topic_model.save(model_path)
  topics_path = opj(output_root, "topics.npy")
  np.save(topics_path, topics)
  probs_path = opj(output_root, "probs.npy")
  np.save(probs_path, probs)
  topics_distr_path = opj(output_root, "topics_distr.npy")
  np.save(topics_distr_path, topics_distr)

  _, num_topics = topics_distr.shape
  for output_n_topics in output_n_topics_list:
    if output_n_topics > num_topics:
      continue

    diagrams_output_root = opj(output_root, f"diagrams_top_{output_n_topics}_topics")
    os.makedirs(diagrams_output_root, exist_ok = True)
    diagram_map = get_topic_model_diagrams(topic_model,num_topics, embeddings,data,topics, output_n_topics = output_n_topics)
    for name, diagram in diagram_map.items():
      path = opj(diagrams_output_root, f"{name}.png")
      diagram.write_image(path)


def get_topic_model_diagrams(topic_model, num_topics,embeddings,data, topics, output_n_topics):
  output = {}
  # MAX_TOPICS = 20
  # output_n_topics = MAX_TOPICS if num_topics > MAX_TOPICS else num_topics

  topic_distance_map = topic_model.visualize_topics(top_n_topics=output_n_topics)

  try:
    topic_similarity_heat_map = topic_model.visualize_heatmap(width=1000, height=1000,top_n_topics = output_n_topics)
  except Exception:
    topic_similarity_heat_map = None

  try:
    time_chart = topic_model.visualize_topics_over_time(topics_over_time = topics, topics = [i+1 for i in range(output_n_topics)])
  except Exception:
    time_chart = None

  visualize_samples = 100000
  sample_percentage = visualize_samples / len(data) if  len(data) > visualize_samples else None
  # sample_percentage = 0.2
  try:
    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
    tsne_documents_distribution = topic_model.visualize_documents(data,
                                  reduced_embeddings=reduced_embeddings,sample = sample_percentage,
                                  custom_labels=True,hide_annotations=True, topics = [i+1 for i in range(output_n_topics)])
  except Exception as e:
    print(e)
    pass
  # data_map_documents_distribution = topic_model.visualize_document_datamap(data, reduced_embeddings=reduced_embeddings,
  #                                                   custom_labels=True, topics = [i+1 for i in range(output_n_topics)])
  try:
    top_5_word_scores = topic_model.visualize_barchart(top_n_topics = output_n_topics, n_words = 5,height = 300, width = 300) ##
    top_10_word_scores = topic_model.visualize_barchart(top_n_topics = output_n_topics, n_words = 10,height = 300, width = 300) ##
  except Exception as e:
    print(e)
    pass

  try:
    term_rank = topic_model.visualize_term_rank(topics = [i+1 for i in range(output_n_topics)])
  except Exception as e:
    print(e)
    pass

  try:
    topic_hierarchy = topic_model.visualize_hierarchy(top_n_topics=output_n_topics)
  except Exception:
    topic_hierarchy = None


  # diagrams = ["topic_distance_map", "topic_similarity_heat_map", "time_chart", "tsne_documents_distribution","data_map_documents_distribution",
  #             "top_5_word_scores", "top_10_word_scores","term_rank", "topic_hierarchy"]
  diagrams = ["topic_distance_map", "topic_similarity_heat_map", "time_chart", "tsne_documents_distribution","data_map_documents_distribution",
              "top_5_word_scores", "top_10_word_scores","term_rank", "topic_hierarchy"]
  for name in diagrams:
    if name in locals():
      diagram = locals()[name]
      if diagram is not None:
        output[name] = diagram

  return output

# invoke

model pools

In [None]:
from sklearn.cluster import AgglomerativeClustering

DEFAULT_N_COMPONENTS = 5
DEFAULT_N_CLUSTERS = 8
MIN_CLUSTER_SIZE = 50

###

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer,TfidfTransformer

tf_idf_emb = TfidfVectorizer()
hashing_emb = HashingVectorizer()
word_count_emb = CountVectorizer()
# tf_idf_transformer_emb = TfidfTransformer()

traditional_emb_models_dict = {
    "tf_idf_emb": tf_idf_emb,
    # "hashing_emb": hashing_emb,
    "word_count_emb": word_count_emb,
}

base_sentence_transformer_emb = SentenceTransformer("all-MiniLM-L6-v2")
sota_sentence_transformer_emb = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
small_sentence_transformer_emb =  SentenceTransformer("paraphrase-albert-small-v2")
semantic_search_emb =  SentenceTransformer("multi-qa-mpnet-base-dot-v1")
bing_search_emb =  SentenceTransformer("msmarco-bert-base-dot-v5")
clip_emb = SentenceTransformer("clip-ViT-L-14")
google_questions_emb = SentenceTransformer("nq-distilbert-base-v1")
meta_questions_emb = SentenceTransformer("facebook-dpr-ctx_encoder-multiset-base")


deep_emb_model_dict = {
"base_sentence_transformer_emb": base_sentence_transformer_emb,
"sota_sentence_transformer_emb": sota_sentence_transformer_emb,
"small_sentence_transformer_emb": small_sentence_transformer_emb,
"semantic_search_emb": semantic_search_emb,
"bing_search_emb": bing_search_emb,
"clip_emb": clip_emb,
"google_questions_emb": google_questions_emb,
"meta_questions_emb": meta_questions_emb
}

embeddings_model_dict = {**traditional_emb_models_dict, **deep_emb_model_dict}
advanced_embeddings_model_dict = {
    "sota_sentence_transformer_emb": sota_sentence_transformer_emb,
    "clip_emb": clip_emb,
}

from umap import UMAP
from sklearn.decomposition import TruncatedSVD,LatentDirichletAllocation,DictionaryLearning, PCA

LDA_dr = LatentDirichletAllocation(n_components=DEFAULT_N_COMPONENTS)
LSA_dr = TruncatedSVD(n_components=DEFAULT_N_COMPONENTS)
dict_dr = DictionaryLearning(n_components = DEFAULT_N_COMPONENTS)
umap_dr = UMAP(n_neighbors=15, n_components=DEFAULT_N_COMPONENTS, min_dist=0.0, metric='cosine', random_state=42)
PCA_dr = PCA(n_components=DEFAULT_N_COMPONENTS)

dr_model_dict = {
    'LDA': LDA_dr,
    'LSA': LSA_dr,
    'DictionaryLearning': dict_dr,
    'umap': umap_dr,
    'PCA': PCA_dr
}

###

# Improving Default Representation
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

k_means_cluster = KMeans(n_clusters=DEFAULT_N_CLUSTERS)
HDBSCAN_cluster = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


###
# Controlling Number of Topics
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer_vectorizer = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

from bertopic.representation import KeyBERTInspired

keybert_representation = KeyBERTInspired()

representation_model = {
     KeyBERTInspired()
}


# tf-idf

In [10]:


k_means_cluster = KMeans(n_clusters=5)

input_root = "/content/drive/MyDrive/ML_project/output/100000_test_embeddings/tf_idf_emb/"
subtype = "HDBSCAN_cluster__and__umap_dr"

# embeddings = np.load(opj(input_root, subtype, "embeddings.npy"), allow_pickle=True)
# reduced_embeddings = np.load(opj(input_root, subtype, "reduced_embeddings.npy"), allow_pickle=True)
embeddings = None
reduced_embeddings = None


output_root = f"/content/drive/MyDrive/ML_project/output/tf_idf/first_round/{subtype}"
# pipeline_single_setting(data, output_root, embedding_model = tf_idf_emb ,DR_model = LSA_dr,
#                         clustering_model = k_means_cluster ,vectorizer_model = CountVectorizer_vectorizer,
#                         representation_model = representation_model,
#                         output_n_topics_list = (8,20,50), embeddings = embeddings,reduced_embeddings = reduced_embeddings)

# pipeline_single_setting(data, output_root, embedding_model = tf_idf_emb ,DR_model = umap_dr,
#                         clustering_model = HDBSCAN_cluster ,vectorizer_model = CountVectorizer_vectorizer,
#                         representation_model,
#                         output_n_topics_list = (8,20,50), embeddings = embeddings,reduced_embeddings = reduced_embeddings):

In [16]:
representation_model = {
    "KeyBERT": keybert_representation
}

In [17]:
embedding_model = tf_idf_emb
DR_model = LSA_dr,
clustering_model = k_means_cluster
vectorizer_model = CountVectorizer_vectorizer
output_n_topics_list = [8,20,50]

if embeddings is not None:
  if reduced_embeddings is None:
    try:
      reduced_embeddings = DR_model.fit_transform(embeddings)
    except:
      print(1)
      transformed_embeddings = np.abs(embeddings)
      print(2)
      reduced_embeddings = DR_model.fit_transform(transformed_embeddings)
      print(3)

topic_model = BERTopic(
# Pipeline models
embedding_model=embedding_model,
umap_model=DR_model,
hdbscan_model=clustering_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,

# Hyperparameters
top_n_words=30,
# nr_topics = nr_topics,
verbose=True
)


2024-05-11 14:06:17,715 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

2024-05-11 14:06:45,220 - BERTopic - Embedding - Completed ✓
2024-05-11 14:06:45,222 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


AttributeError: 'tuple' object has no attribute 'fit'

In [18]:
topic_model.umap_model

(TruncatedSVD(n_components=5),)

In [19]:
topics, probs = topic_model.fit_transform(data, embeddings)

2024-05-11 14:09:34,215 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

2024-05-11 14:09:58,597 - BERTopic - Embedding - Completed ✓
2024-05-11 14:09:58,598 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


AttributeError: 'tuple' object has no attribute 'fit'

In [None]:


topics, probs = topic_model.fit_transform(data, embeddings)
topics_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)
topic_representation_df = topic_model.get_topic_info()


os.makedirs(output_root, exist_ok = True)
embeddings_path = opj(output_root, "embeddings.npy")
np.save(embeddings_path, embeddings)
reduced_embeddings_path = opj(output_root, "reduced_embeddings.npy")
np.save(reduced_embeddings_path, reduced_embeddings_path)
topic_representation_path = opj(output_root, "topic_representations.csv")
topic_representation_df.to_csv(topic_representation_path, index = False)
model_path = opj(output_root, "model")
topic_model.save(model_path)
topics_path = opj(output_root, "topics.npy")
np.save(topics_path, topics)
probs_path = opj(output_root, "probs.npy")
np.save(probs_path, probs)
topics_distr_path = opj(output_root, "topics_distr.npy")
np.save(topics_distr_path, topics_distr)

_, num_topics = topics_distr.shape
for output_n_topics in output_n_topics_list:
  if output_n_topics > num_topics:
    continue

  diagrams_output_root = opj(output_root, f"diagrams_top_{output_n_topics}_topics")
  os.makedirs(diagrams_output_root, exist_ok = True)
  diagram_map = get_topic_model_diagrams(topic_model,num_topics, embeddings,data,topics, output_n_topics = output_n_topics)
  for name, diagram in diagram_map.items():
    path = opj(diagrams_output_root, f"{name}.png")
    diagram.write_image(path)

AttributeError                            Traceback (most recent call last)
<ipython-input-17-61928b50f658> in <cell line: 30>()
     28 verbose=True
     29 )
---> 30 topics, probs = topic_model.fit_transform(data, embeddings)
     31 topics_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)
     32 topic_representation_df = topic_model.get_topic_info()

1 frames
/usr/local/lib/python3.10/dist-packages/bertopic/_bertopic.py in _reduce_dimensionality(self, embeddings, y, partial_fit)
   3470                 # cuml umap needs y to be an numpy array
   3471                 y = np.array(y) if y is not None else None
-> 3472                 self.umap_model.fit(embeddings, y=y)
   3473             except TypeError:
   3474

AttributeError: 'tuple' object has no attribute 'fit'

test dr and cluster

In [7]:


# num_samples = 100000
# data = cleaned_text[:num_samples]

# # embedding_models = {"paraphrase-multilingual-mpnet-base-v2": , }

# cluster_models = {"HDBSCAN_cluster": HDBSCAN_cluster,"k_means_8_cluster": KMeans(n_clusters=8)
#   , "k_means_50_cluster": KMeans(n_clusters=50)}
# # cluster_models = {"AgglomerativeCluster": AgglomerativeClustering(n_clusters=50)}

# embedding_model = base_sentence_transformer_emb
# vectorizer_model = CountVectorizer_vectorizer
# representation_model = {
#     "KeyBERT": keybert_representation,
# }



# root = "/content/drive/MyDrive/ML_project/output/100000_test_dr_and_clustering"

# for m,clustering_model in cluster_models.items():
#   for dr,DR_model in dr_model_dict.items():
#     try:
#       output_root = opj(root, f"{m}__and__{dr}")
#       pipeline_single_setting(data, output_root, embedding_model,DR_model,clustering_model,vectorizer_model,representation_model)

#       import time
#       print(f"finish {m}__and__{dr} at {time.ctime(time.time())}")
#     except Exception as e:
#       print(f"finish {m}__and__{dr} error: {e}")



test embeddings

In [None]:
num_samples = 100000
data = cleaned_text[:num_samples]

root = "/content/drive/MyDrive/ML_project/output/100000_test_embeddings"
base_cluster_models = {"HDBSCAN_cluster": HDBSCAN_cluster, "k_means_8_cluster": KMeans(n_clusters=8), }
base_dr_models = {"umap_dr":umap_dr,"LSA_dr": LSA_dr }
vectorizer_model = CountVectorizer_vectorizer
representation_model = {
    "KeyBERT": keybert_representation,
}


for e,embedding_model in advanced_embeddings_model_dict.items():
    try:
        embeddings = embedding_model.encode(data, show_progress_bar=True)
    except Exception:
        embeddings = embedding_model.fit_transform(data)

    for m,clustering_model in base_cluster_models.items():
        for dr,DR_model in base_dr_models.items():
          import time
          try:
              output_root = opj(root, e,f"{m}__and__{dr}")
              print(f"# ◅▯◊║◊▯▻   Start {m}__and__{dr} at {time.ctime(time.time())}    ◅▯◊║◊▯▻")
              pipeline_single_setting(data, output_root, embedding_model,DR_model,clustering_model,vectorizer_model,representation_model, embeddings = embeddings)

              print(f" ◅▯◊║◊▯▻   finish {m}__and__{dr} at {time.ctime(time.time())}    ◅▯◊║◊▯▻")
          except Exception as exception:
              print(f"# ◅▯◊║◊▯▻  {m}__and__{dr}  Error: {exception}    ◅▯◊║◊▯▻")



# Bugs

finish k_means_8_cluster__and__LSA_dr error: Found array with dim 3. None expected <= 2

2024-05-10 05:36:59,314 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 05:42:24,707 - BERTopic - Dimensionality - Completed ✓
2024-05-10 05:42:24,712 - BERTopic - Cluster - Start clustering the reduced embeddings
/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning:

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-05-10 05:42:35,318 - BERTopic - Cluster - Completed ✓
2024-05-10 05:42:35,349 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 05:43:30,698 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:03<00:00, 26.05it/s]
2024-05-10 05:43:39,311 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__umap_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 05:45:23,769 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 05:45:24,045 - BERTopic - Dimensionality - Completed ✓
2024-05-10 05:45:24,050 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 05:45:32,909 - BERTopic - Cluster - Completed ✓
2024-05-10 05:45:32,930 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 05:45:36,632 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 49.68it/s]
2024-05-10 05:45:38,965 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__LSA_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 05:49:00,671 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 05:54:09,043 - BERTopic - Dimensionality - Completed ✓
2024-05-10 05:54:09,048 - BERTopic - Cluster - Start clustering the reduced embeddings
Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7dc5444c7c70>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1175, in _make_controller_from_path
    lib_controller = controller_class(
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
  File "/usr/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /usr/local/lib/python3.10/dist-packages/numpy.libs/libopenblas64_p-r0-5007b62f.3.23.dev.so: cannot open shared object file: No such file or directory
2024-05-10 05:54:10,371 - BERTopic - Cluster - Completed ✓
2024-05-10 05:54:10,392 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 05:54:12,461 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 49.27it/s]
2024-05-10 05:54:14,618 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__umap_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 05:54:53,710 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 05:54:53,987 - BERTopic - Dimensionality - Completed ✓
2024-05-10 05:54:53,995 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 05:54:55,926 - BERTopic - Cluster - Completed ✓
2024-05-10 05:54:55,947 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 05:54:57,938 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:01<00:00, 50.24it/s]
2024-05-10 05:55:00,072 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__LSA_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 05:58:14,774 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:03:28,855 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:03:28,859 - BERTopic - Cluster - Start clustering the reduced embeddings
/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning:

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-05-10 06:03:37,755 - BERTopic - Cluster - Completed ✓
2024-05-10 06:03:37,785 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:04:21,466 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:03<00:00, 28.64it/s]
2024-05-10 06:04:27,319 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__umap_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 06:04:56,391 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:04:56,684 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:04:56,689 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:05:04,446 - BERTopic - Cluster - Completed ✓
2024-05-10 06:05:04,468 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:05:12,064 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 42.17it/s]
2024-05-10 06:05:15,052 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__LSA_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 06:08:28,921 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:13:41,340 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:13:41,344 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:13:42,645 - BERTopic - Cluster - Completed ✓
2024-05-10 06:13:42,665 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:13:44,736 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 47.76it/s]
2024-05-10 06:13:46,960 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__umap_dr error: Found array with dim 3. None expected <= 2.
2024-05-10 06:14:19,209 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:14:19,572 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:14:19,576 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:14:21,228 - BERTopic - Cluster - Completed ✓
2024-05-10 06:14:21,249 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:14:23,274 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 49.88it/s]
2024-05-10 06:14:25,400 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__LSA_dr error: Found array with dim 3. None expected <= 2.
Batches: 100%
 3125/3125 [00:24<00:00, 135.77it/s]
2024-05-10 06:16:57,063 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:18:58,276 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:18:58,280 - BERTopic - Cluster - Start clustering the reduced embeddings
/usr/local/lib/python3.10/dist-packages/joblib/externals/loky/backend/fork_exec.py:38: RuntimeWarning:

os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

2024-05-10 06:19:06,319 - BERTopic - Cluster - Completed ✓
2024-05-10 06:19:06,351 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:19:49,395 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:03<00:00, 26.90it/s]
2024-05-10 06:19:55,938 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__umap_dr at Fri May 10 06:23:04 2024
2024-05-10 06:23:04,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:23:05,378 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:23:05,382 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:23:17,273 - BERTopic - Cluster - Completed ✓
2024-05-10 06:23:17,293 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:23:19,202 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:01<00:00, 51.93it/s]
2024-05-10 06:23:21,458 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish HDBSCAN_cluster__and__LSA_dr at Fri May 10 06:23:22 2024
2024-05-10 06:25:26,416 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:27:25,506 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:27:25,509 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:27:28,477 - BERTopic - Cluster - Completed ✓
2024-05-10 06:27:28,499 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:27:30,553 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 46.63it/s]
2024-05-10 06:27:33,050 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__umap_dr at Fri May 10 06:28:31 2024
2024-05-10 06:28:31,654 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-10 06:28:32,133 - BERTopic - Dimensionality - Completed ✓
2024-05-10 06:28:32,137 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-10 06:28:34,345 - BERTopic - Cluster - Completed ✓
2024-05-10 06:28:34,366 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-10 06:28:37,243 - BERTopic - Representation - Completed ✓
100%|██████████| 100/100 [00:02<00:00, 46.24it/s]
2024-05-10 06:28:39,771 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
finish k_means_8_cluster__and__LSA_dr at Fri May 10 06:29:22 2024
Batches:  18%
 558/3125 [00:07<00:31, 80.44it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-26-b7d41118abe4> in <cell line: 13>()
     14   try:
     15     try:
---> 16       embeddings = embedding_model.encode(data, show_progress_bar=True)
     17     except Exception:
     18       embeddings = embedding_model.fit_transform(data)

20 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in softmax(input, dim, _stacklevel, dtype)
   1826
   1827
-> 1828 def softmax(input: Tensor, dim: Optional[int] = None, _stacklevel: int = 3, dtype: Optional[DType] = None) -> Tensor:
   1829     r"""Apply a softmax function.
   1830

KeyboardInterrupt:


# Test

setting

In [9]:

# root = "/content/drive/MyDrive/ML_project/output/100000_first_round"
# num_samples = 10000
# data = cleaned_text[:num_samples]

# cluster_models = {"HDBSCAN_clustering": HDBSCAN_clustering,"k_means_clustering": k_means_clustering}
# dr_models = {"umap_dr":umap_dr,"LDA_dr": LDA_dr,"LSA_dr": LSA_dr ,"dict_dr": LSA_dr}

# embedding_model = sentence_transformer_embedding
# vectorizer_model = CountVectorizer_vectorizer
# representation_model = {
#     "KeyBERT": keybert_representation,
# }
# clustering_model = HDBSCAN_clustering
# DR_model = umap_dr



run

In [10]:


# embeddings = embedding_model.encode(data, show_progress_bar=True)
# reduced_embeddings = DR_model.fit_transform(embeddings)

# topic_model = BERTopic(
# # Pipeline models
# embedding_model=embedding_model,
# umap_model=DR_model,
# hdbscan_model=clustering_model,
# vectorizer_model=vectorizer_model,
# representation_model=representation_model,

# # Hyperparameters
# top_n_words=30,
# # nr_topics = nr_topics,
# verbose=True
# )

# topics, probs = topic_model.fit_transform(data, embeddings)
# topics_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)
# topic_representation_df = topic_model.get_topic_info()

# _, num_topics = topics_distr.shape


# MAX_TOPICS = 50
# output_n_topics = MAX_TOPICS if num_topics > MAX_TOPICS else num_topics




setting 2: k means

In [11]:
# embeddings = embedding_model.encode(data, show_progress_bar=True)
# reduced_embeddings = DR_model.fit_transform(embeddings)

# clustering_model1 = k_means_clustering

# topic_model1 = BERTopic(
# # Pipeline models
# embedding_model=embedding_model,
# umap_model=DR_model,
# hdbscan_model=clustering_model1,
# vectorizer_model=vectorizer_model,
# representation_model=representation_model,

# # Hyperparameters
# top_n_words=30,
# # nr_topics = nr_topics,
# verbose=True
# )

# topics1, probs1 = topic_model1.fit_transform(data, embeddings)
# topics_distr1, _ = topic_model.approximate_distribution(data, window=8, stride=4)
# topic_representation_df1 = topic_model.get_topic_info()

# _, num_topics1 = topics_distr.shape



heat map


In [12]:
# # !pip install -U numpy==1.20
# topic_similarity_heat_map = topic_model.visualize_heatmap(width=1000, height=1000,top_n_topics = output_n_topics)
# topic_similarity_heat_map

# Tools

tsne extension

In [13]:
def get_a_group_of_document_dirtribution_diagrams(input_root, output_root,data):
  os.makedirs(output_root, exist_ok = True)
  topic_model = BERTopic.load(opj(input_root, "model"))
  topics_distr = np.load(opj(output_root, "topics_distr.npy"))
  _, num_topics = topics_distr.shape
  embeddings = np.load(opj(input_root, "embeddings.npy"))

  reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
  print("Finish reduce embeddings.")

  visualize_samples_list = [500, 2000, 10000,20000,50000,100000]
  output_topics_list = [8,20,50]
  for visualize_samples in visualize_samples_list:
    for output_n_topics in output_topics_list:
      if num_topics < output_n_topics:
        continue

      try:
        diagram = get_document_dirtribution_diagram(visualize_samples, topic_model, output_n_topics, data,reduced_embeddings)
      except Exception as e:
        print(e)
        continue

      output_path = opj(output_root, f"samples_{visualize_samples}_topics_{output_n_topics}.png")
      diagram.write_image(output_path)
      print(f"finish samples_{visualize_samples}_topics_{output_n_topics}")



def get_document_dirtribution_diagram(visualize_samples, topic_model,  output_n_topics, data,reduced_embeddings):
  sample_percentage = visualize_samples / len(data) if  len(data) > visualize_samples else None
  # sample_percentage = 0.2
  tsne_documents_distribution = topic_model.visualize_documents(data,
                                reduced_embeddings=reduced_embeddings,sample = sample_percentage,
                                custom_labels=True,hide_annotations=True, topics = [i+1 for i in range(output_n_topics)])
  return tsne_documents_distribution

def add_document_dirtribution_diagrams_for_all(input_root,data):
  for root, dirs, files in os.walk(input_root):
    for f in files:
      if f == "model":
        file_path = opj(root, f)
        print(f"Start {file_path} at {time.ctime(time.time())}")

        input_root = os.path.dirname(file_path)
        output_root = opj(input_root, "documents_distribusion_graphs")
        try:
          get_a_group_of_document_dirtribution_diagrams(input_root, output_root,data)
        except Exception as exception:
          print(f"Error: {exception}")

        print(f"Finish {file_path} at {time.ctime(time.time())}")

In [14]:
# input_root = "/content/drive/MyDrive/ML_project/output"
# add_document_dirtribution_diagrams_for_all(input_root,data)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ML_project/output/100000_first_round/HDBSCAN_clustering__and__umap_dr/documents_distribusion_graphs/topics_distr.npy'


In [15]:
# input_root = "/content/drive/MyDrive/ML_project/output/100000_test_dr_and_clustering/k_means_50_cluster__and__LSA"
# output_root = opj(input_root, "documents_distribusion_graphs")
# get_a_group_of_document_dirtribution_diagrams(input_root, output_root,data)

# Metrics

!pip install octis

In [2]:


import ast

from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

import csv
import os
from os.path import join as opj
import pandas as pd

news_df = pd.read_csv("/content/drive/MyDrive/ML_project/new_text_withstem.csv")
# Extract

cleaned_text = news_df['headline_cleaned_text']
original_text = news_df['headline_text']
num_samples = 100000
cleaned_text = cleaned_text[:num_samples]
cleaned_text = [cleaned_text[i] if isinstance(cleaned_text[i],str) else original_text[i] for i in range(len(cleaned_text))]

def get_topic_metrics(cleaned_text:str,bertopic:list,topk:int,measure="c_uci"):
    bertopic_topics=[]
    for topic in bertopic:
        topics = ast.literal_eval(topic)
        bertopic_topics.append(topics)
        # print(topics)
    corpus = [text.split(" ") for text in cleaned_text]
    coherence = Coherence(texts=corpus,
                            topk=topk, measure=measure)

    diversity = TopicDiversity(topk=topk)

    print(1)
    coherence = coherence.score({"topics": bertopic_topics})
    print(2)
    diversity = diversity.score({"topics": bertopic_topics})
    print(4)

    return coherence, diversity


def evaluate_group_models(input_root,subdirname, output_path,topk = 10,measure = "c_uci"):
    dirs = os.listdir(input_root)
    output_list = []

    for directory in dirs:
        if directory.startswith("."):
            continue

        model_name = directory[:-4]
        csv_path = opj(opj(input_root,directory,subdirname,"topic_representations.csv"))
        df = pd.read_csv(csv_path)

        bertopic = df.loc[:, "Representation"].tolist()
        coherence, diversity = get_topic_metrics(cleaned_text,bertopic,topk,measure=measure)
        print(model_name, coherence,diversity)
        output_list.append([model_name,f"{coherence:.2f}",f"{diversity:.2f}"] )
    headers = ["models", "TC", "TD"]

    output_root = os.path.dirname(output_path)
    os.makedirs(output_root, exist_ok = True)
    with open(output_path, "w+") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        writer.writerows(output_list)





In [3]:

input_root = "/content/drive/MyDrive/ML_project/output/100000_test_embeddings"
csv_output_root = "/content/drive/MyDrive/ML_project/output/evaluation"
subdirname = "HDBSCAN_cluster__and__umap_dr"
output_path = opj(csv_output_root,"embeddings.csv")

measure = "c_v"
evaluate_group_models(input_root,subdirname, output_path,measure = measure)

1
2
4
tf_idf 0.4419690947192578 0.8797631862217438
1
2
4
word_count 0.42112122552002795 0.8481481481481481
1
2
4
base_sentence_transformer 0.38628539387283184 0.8472144846796658
1
2
4
sota_sentence_transformer 0.3761648219120399 0.8384251968503937
1
2
4
clip 0.3968068287973832 0.8484240687679083


NotADirectoryError: [Errno 20] Not a directory: '/content/drive/MyDrive/ML_project/output/100000_test_embeddings/sota_sentence_transformer_emb.zip/HDBSCAN_cluster__and__umap_dr/topic_representations.csv'