In [None]:
from sentence_transformers import SentenceTransformer
from top2vec import Top2Vec
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

from pipeline.service._file import FileService

# Load the lemmatized article corpus through the project's FileService helper
# and report how many rows it contains.
df = FileService.read_parquet_to_df("articles_lemmatized")
print(f"samples: {len(df)}")

# UMAP settings handed to Top2Vec; the same values are listed as the final
# choice in the "Cluster optimization" write-up further down this notebook.
umap_args = {
    "n_neighbors": 55,
    "n_components": 15,
    "metric": "cosine"
}
# HDBSCAN settings handed to Top2Vec (same provenance as umap_args).
hdbscan_args = {
    'min_cluster_size': 20,
    'metric': "euclidean",
    'cluster_selection_method': "eom"
}


# LaBSE sentence encoder; its bound `.encode` method is passed to Top2Vec as a
# callable embedding model.
pretrained_model = SentenceTransformer('sentence-transformers/LaBSE')  
# NOTE(review): chunk_length / chunk_overlap_ratio presumably only take effect
# when split_documents=True is also passed -- confirm against the Top2Vec docs.
# NOTE(review): speed / workers look like doc2vec-training options and may be
# ignored with a callable embedding model -- confirm.
model = Top2Vec(
        documents=list(df["content"]), 
        embedding_model=pretrained_model.encode, 
        chunk_length=pretrained_model.max_seq_length, 
        chunk_overlap_ratio=.2,
        gpu_umap=False,
        umap_args=umap_args,
        gpu_hdbscan=False,
        hdbscan_args=hdbscan_args,
        speed="learn", 
        workers=4,
        verbose=True
    )

In [None]:
import os
import numpy as np
# Persisting the fitted model is currently disabled:
#model.save(os.path.normpath("./models/top2vec/labse-full-optimized"))

# Keep a handle on the document vectors of the model fitted above.
embeddings = model.document_vectors
# Writing the vectors to disk is currently disabled:
#np.save(os.path.normpath("./models/vectorspaces/jan-jun-2020-embeddings.npy"), embeddings)

Reload Test

In [None]:
from top2vec import Top2Vec
from sentence_transformers import SentenceTransformer
import os

# Reload a saved Top2Vec model and re-attach its embedding function.
# Top2Vec documents set_embedding_model() as the step to run after loading a
# model that was trained with a callable embedding model.
pretrained_model = SentenceTransformer('sentence-transformers/LaBSE')
model2 = Top2Vec.load(os.path.normpath("./models/top2vec/labse-full-optimized"))
# Pass the same callable that was used for training (`.encode`), not the
# SentenceTransformer module object: calling the module directly expects
# tokenized features rather than raw strings.
model2.set_embedding_model(pretrained_model.encode)
# Displayed as the cell output: number of topics in the reloaded model.
model2.get_num_topics()


**After optimization:**  
93 topics for German + French, instead of over 180 for French alone

# Trying to optimize umap after the fact (outdated)

In [None]:
# Document vectors of the reloaded model; the cell output is their shape.
v = model2.document_vectors
v.shape

In [None]:
import umap


# Fit a 2-component UMAP (cosine metric, 5 neighbours) on the document vectors,
# then project those same vectors with the fitted reducer.
umap_model = umap.UMAP(n_neighbors=5, n_components=2, metric='cosine', verbose=False).fit(v)
reduced2d = umap_model.transform(v)

In [None]:
import matplotlib.pyplot as plt
# Scatter the two UMAP components against each other.
x = reduced2d[:, 0]
y = reduced2d[:, 1]
plt.scatter(x, y)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Use matplotlib's default style for the figure below.
plt.style.use('default')

# Shift the vectors by |min(v)| before fitting with the 'hellinger' metric
# (when the minimum is negative this makes the smallest entry exactly 0).
v_pos = abs(np.min(v)) + v
#v_pos = v
# Rebinds `umap_model`: 3-component UMAP fitted on the shifted vectors.
umap_model = umap.UMAP(min_dist=0.1 ,n_neighbors=15, n_components=3, metric='hellinger', verbose=False).fit(v_pos)
reduced3d = umap_model.transform(v_pos)

# Prepare a figure with 3D axes
fig = plt.figure()
ax = plt.axes(projection='3d')

# Split the projection into its three coordinate columns
xdata = reduced3d[:,0]
ydata = reduced3d[:,1]
zdata = reduced3d[:,2]

# 3D scatter, coloured by the third coordinate
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='viridis')

plt.show()

In [None]:
# How many projected points lie beyond each hand-picked coordinate cut-off.
print((ydata > 8).sum())
print((xdata > 5).sum())

# Cluster optimization

In [None]:
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials, rand
import logging
import umap
import numpy as np
import hdbscan

from top2vec import Top2Vec
from sentence_transformers import SentenceTransformer
import os
import sys
import logging

# Send log records to stdout so they show up in the notebook output.
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format='%(asctime)s | [%(filename)s:%(lineno)d] %(levelname)s | %(message)s')
logger = logging.getLogger(__name__)

# Saved Top2Vec model whose document vectors are used for the search below.
#dataset_name = "./models/top2vec/labse-fr-dataset"
dataset_name = "./models/top2vec/labse-full-dataset"

pretrained_model = SentenceTransformer('sentence-transformers/LaBSE', device="cpu")
model2 = Top2Vec.load(os.path.normpath(dataset_name))
# Re-attach the embedding *callable* (`.encode`), matching how the model is
# trained elsewhere in this notebook; the SentenceTransformer module object
# itself does not accept raw strings when called directly.
model2.set_embedding_model(pretrained_model.encode)
print("topics of loaded model", model2.get_num_topics())
# Fixed input for every optimisation trial (bound as a default argument of
# hyperparameter_tuning when that function is defined).
loaded_embeddings = model2.document_vectors


def hyperparameter_tuning(params, embeddings=loaded_embeddings, target_clusters=30, threshold_labels=50):
    """Hyperopt objective: UMAP-reduce, HDBSCAN-cluster, score the cluster count.

    Parameters
    ----------
    params : dict
        One sampled point of the search space. Keys read here:
        ``n_neighbors``, ``n_components``, ``umap_metric``,
        ``min_cluster_size``, ``hdbscan_metric``, ``cluster_selection_method``.
    embeddings : array-like
        Vectors to reduce and cluster. The default is bound once, when the
        function is defined, to the ``loaded_embeddings`` that exists then.
    target_clusters : int
        Desired number of sizeable clusters.
    threshold_labels : int
        A cluster only counts towards ``n_labels`` when it has strictly more
        members than this.

    Returns
    -------
    dict
        ``loss`` (``|target_clusters - n_labels|``), ``status``, ``n_labels``
        and ``below_threshold`` (number of clusters at or under the threshold).
        HDBSCAN's noise label ``-1`` is not a cluster and is excluded from
        both counts.
    """
    log = logging.getLogger(__name__)

    umap_args = {
        "n_neighbors": params["n_neighbors"],
        "n_components": params["n_components"],
        "metric": params["umap_metric"]
    }
    hdbscan_args = {
        'min_cluster_size': params["min_cluster_size"],
        'metric': params["hdbscan_metric"],
        'cluster_selection_method': params["cluster_selection_method"]
    }

    if params["umap_metric"] == "hellinger":
        # Shift by |min| so a negative minimum becomes 0 before using the
        # hellinger metric. Rebinding the name leaves the caller's array as is.
        embeddings = abs(np.min(embeddings)) + embeddings

    log.debug("starting umap fitting")
    log.debug(umap_args)
    reduced_embeddings = umap.UMAP(**umap_args).fit_transform(embeddings)

    log.debug("starting dbscan fitting")
    log.debug(hdbscan_args)
    labels = hdbscan.HDBSCAN(**hdbscan_args).fit(reduced_embeddings).labels_

    unique, counts = np.unique(labels, return_counts=True)
    # HDBSCAN marks noise points with label -1; that bucket is not a topic and
    # must not be counted as one, however many points it holds.
    cluster_sizes = counts[unique != -1]
    n_labels = int(np.count_nonzero(cluster_sizes > threshold_labels))
    log.debug(f"Found {n_labels} labels")
    loss = abs(target_clusters - n_labels)

    return {
        "n_labels": n_labels,
        "status": STATUS_OK,
        "loss": loss,
        "below_threshold": int(cluster_sizes.size) - n_labels,
    }

In [None]:
# Initialize trials object

# Candidate values per hyper-parameter. They are kept as module-level lists
# because the results cell maps hp.choice indices back to these values.
n_neighbors = [20, 25, 40, 45, 55]
n_components = [5, 9, 13, 17, 20, 23]
umap_metric = ["hellinger", "cosine"]
min_cluster_size = [10, 15, 20]
hdbscan_metric = ["euclidean"]
cluster_selection_method = ["eom"]

space = {
    "n_neighbors": hp.choice("n_neighbors", n_neighbors),
    "n_components": hp.choice("n_components", n_components),
    "umap_metric": hp.choice("umap_metric", umap_metric),
    "min_cluster_size": hp.choice("min_cluster_size", min_cluster_size),
    "hdbscan_metric": hp.choice("hdbscan_metric", hdbscan_metric),
    "cluster_selection_method": hp.choice("cluster_selection_method", cluster_selection_method)
}

trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    # Random search, using the `rand` module imported from hyperopt directly
    # rather than reaching it through tpe's internals. Use tpe.suggest for TPE.
    algo=rand.suggest,
    max_evals=200,
    trials=trials
)

print("Best: {}".format(best))

In [None]:
from hyperopt import space_eval
# Resolve `best` against the search space with hyperopt's space_eval and
# display the result.
space_eval(space, best)

In [None]:
from pprint import pprint
import pandas as pd
from pathlib import Path
import pickle
import json

#pprint([t for t in trials][0]["result"])

def unpack(x):
    """Return the first element of ``x``; NaN when ``x`` is empty or falsy."""
    return x[0] if x else np.nan

# One row per trial: the sampled hyper-parameter entries from misc.vals,
# unpacked from their single-element lists, plus the objective's outputs.
trials_df = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials])
for result_key in ("loss", "n_labels", "below_threshold"):
    trials_df[result_key] = [t["result"][result_key] for t in trials]

# Candidate lists of the search space, keyed by parameter name. Also written
# to disk below so the run can be interpreted later.
experiment_params = {
    "n_neighbors": n_neighbors,
    "n_components": n_components,
    "umap_metric": umap_metric,
    "min_cluster_size": min_cluster_size,
    "hdbscan_metric": hdbscan_metric,
    "cluster_selection_method": cluster_selection_method,
}

# hp.choice records the *index* of the chosen option; translate every column
# back to the actual value with an explicit index -> value mapping.
for param_name, choices in experiment_params.items():
    trials_df[param_name] = trials_df[param_name].map(dict(enumerate(choices)))

# Best (lowest loss) trials first.
trials_df = trials_df.sort_values(by="loss")

# Name the output folder after the booking time of the first trial.
experiment_start = next(iter(trials))["book_time"]
timestamp = experiment_start.strftime('%Y-%m-%d-%H-%M-%S')
experiment_folder = os.path.normpath(f"./umap-hdbet-hyperopt-{timestamp}/")
Path(experiment_folder).mkdir(parents=True, exist_ok=True)

with open(os.path.join(experiment_folder, "dataset-path.txt"), "w") as f:
    f.write(dataset_name)

with open(os.path.join(experiment_folder, "optimization-space.json"), "w") as f:
    json.dump(experiment_params, f)

trials_df.to_csv(os.path.join(experiment_folder, "optimization-results.csv"))

trials_df

In [None]:
# Reload a previously saved optimisation run from disk
import pandas as pd
import os

# Bind the result to `trials_df`, the name every analysis cell below reads.
# Assigning to `df` would overwrite the articles frame loaded at the top of
# the notebook. The first CSV column is the index written by DataFrame.to_csv.
trials_df = pd.read_csv(
    os.path.normpath("./umap-hdbet-hyperopt-2024-11-10-15-04-29/optimization-results.csv"),
    index_col=0,
)

In [None]:
# Point plot of each trial's loss against its n_neighbors value.
trials_df.plot(x="n_neighbors", y="loss", style="o", title="Number of neighbors vs. loss")

In [None]:
# Point plot of each trial's loss against its n_components value.
trials_df.plot(x="n_components", y="loss", style="o", title="Number of Components vs. loss")

In [None]:
# Point plot of each trial's loss against its min_cluster_size value.
trials_df.plot(x="min_cluster_size", y="loss", style="o", title="Minimum cluster size vs. loss")

In [None]:
# Frequency of each UMAP metric over all rows, then over the first 50 rows
# only (the lowest-loss trials when trials_df is sorted by loss).
print(trials_df["umap_metric"].value_counts())
print(trials_df.iloc[:50]["umap_metric"].value_counts())

In [None]:
import matplotlib.pyplot as plt
# Pairwise correlation between the numeric hyper-parameters and the loss,
# rendered as a colour-graded table with two decimals.
correlation = trials_df[["min_cluster_size", "n_components", "n_neighbors", "loss"]].corr()
correlation.style.background_gradient(cmap='coolwarm').format(precision=2)

The best result is:
```py
{
    'cluster_selection_method': 'eom',
    'hdbscan_metric': 'euclidean',
    'min_cluster_size': 15,
    'n_components': 13,
    'n_neighbors': 40,
    'umap_metric': 'cosine'
}
```

Within the search space, all variables have a negative correlation with the loss: as a variable increases, the loss decreases. 

The custom loss function was to come close to approximately 30 labels since there are around 29 topics shown on the 20min webpage.  
For the best result we have ``n_neighbors = 40``, but if we inspect the top 10 results they tend to go higher. This confirms the intuition that we can reduce the number of labels by forcibly increasing the number of required neighbors. We shall choose ``55`` as we increase the corpus.   
The correlation for the number of `components` is very weak and, within the search space, not significant. In the top 10 results the mean is `14.1`, which is very close to the search space mean of `14.5`. We shall pick any number within the space.  
The UMAP `metric` is quite balanced, within the top 10 as well as the whole search space. We shall choose `cosine` as this will behave better for sparser datasets and is understood to be regularly used for NLP.  
The top 10 `min_cluster_size` with a mean of 17.5 is close to the search space mean of 15. We shall choose ``20`` as we increase the corpus.  

With this variable selection we can see that it would fit the second best result which also has the lowest number of labels below the threshold. So this can indicate more cohesion within the HDBSCAN labels.

The model shall be fitted with the following parameters:
```python
umap_args = {
    "n_neighbors": 55,
    "n_components": 15,
    "metric": "cosine"
}
hdbscan_args = {
    'min_cluster_size': 20,
    'metric': "euclidean",
    'cluster_selection_method': "eom"
}
```

In [None]:
# First ten rows of the trials table (the ten lowest-loss trials when
# trials_df is sorted by loss).
trials_df.iloc[:10]

# Explore topics

In [None]:
# Number of topics in `model` (the model fitted at the top of the notebook,
# not the reloaded `model2`).
model.get_num_topics()

In [None]:
# Unpack the two values returned by get_topic_sizes().
topic_sizes, topic_nums = model.get_topic_sizes()

In [None]:
# Unpack the three values returned by get_topics(5); rebinds `topic_nums`.
topic_words, word_scores, topic_nums = model.get_topics(5)

In [None]:
# Display the topic numbers from the most recent assignment to `topic_nums`.
topic_nums

In [None]:
# Ask for 2 topics for the keyword "allemand"; rebinds the topic_* names.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["allemand"], num_topics=2)

In [None]:
# Print the words, scores and numbers from the most recent topic search.
print(topic_words, topic_scores, topic_nums)

In [None]:
#reduced_hierarchy = model.hierarchical_topic_reduction(75, interval=None)

In [None]:
# NOTE(review): `reduced_hierarchy` is not assigned by any active code in the
# visible notebook (the hierarchical_topic_reduction call in the previous cell
# is commented out), so this cell depends on leftover kernel state.
# Prints the index of every hierarchy entry that contains topic 187.
for idx, topics in enumerate(reduced_hierarchy):
    if 187 in topics:
        print(idx)

In [None]:
# Word cloud for topic number 86.
model.generate_topic_wordcloud(86)

In [None]:
# Ask for 3 topics for the keyword "crypto" (rebinds the topic_* names) and
# draw one word cloud per returned topic number.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["crypto"], num_topics=3)
for topic in topic_nums:
    model.generate_topic_wordcloud(topic, reduced=False)

In [None]:
# Load the "articles_cleaned" dataset into `df2`.
# NOTE(review): `df2` is not read by any later cell in the visible notebook.
df2 = FileService.read_parquet_to_df("articles_cleaned")

In [None]:
# Request 129 documents for topic number 20 and unpack the three return values.
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=20, num_docs=129)

In [None]:
# Display the ids returned by the topic document search above.
document_ids

In [None]:
# Show the row at position 5538 of `df`.
# NOTE(review): `df` is a notebook-global name; confirm it still refers to the
# articles frame loaded at the top (and that `df2`, loaded above but otherwise
# unused, is not the intended frame) before trusting this row.
df.iloc[5538]