In [None]:
pip install -U datasets

In [None]:
%%capture
!pip install --upgrade transformers==4.41.2 sentence-transformers==2.2.2 gensim==4.3.2 accelerate==0.31.0 peft==0.11.1 numpy==1.26.4

In [None]:
from datasets import disable_caching
# Switch off the `datasets` caching mechanism before any dataset is loaded below
disable_caching()

In [None]:
# Fetch the "maartengr/arxiv_nlp" dataset from the Hugging Face Hub and keep its "train" split
from datasets import load_dataset

dataset_splits = load_dataset("maartengr/arxiv_nlp")
dataset = dataset_splits["train"]
dataset

In [None]:
# Uninstall scikit-learn and scipy to resolve potential version conflicts
#!pip uninstall -y scikit-learn scipy

# Reinstall sentence-transformers to get compatible versions of dependencies
#!pip install --upgrade sentence-transformers

In [None]:
# Pull the abstract and title columns out of the dataset
abstracts, titles = dataset["Abstracts"], dataset["Titles"]

In [None]:
# Materialise the abstracts column as a plain Python list
abstract_list = [abstract for abstract in abstracts]

In [None]:
!pip install --upgrade sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding model used for every abstract
embedding_model = SentenceTransformer("thenlper/gte-small")

# Convert each abstract into an embedding vector (a progress bar is shown while encoding)
embeddings = embedding_model.encode(
    abstract_list,
    show_progress_bar=True,
)

# Shape of the resulting embeddings
embeddings.shape

# Dimensionality Reduction

In [None]:
from umap import UMAP

# Project the 384-dimensional embeddings down to 5 dimensions
umap_model = UMAP(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Shape of the reduced embeddings
reduced_embeddings.shape

# Clustering

In [None]:
from hdbscan import HDBSCAN

# Fit the model on the reduced embeddings and extract one cluster label per document
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom').fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# How many clusters did we generate?
# HDBSCAN assigns the label -1 to noise points; that label is not a cluster,
# so it is removed before counting.
print(len(set(clusters.tolist()) - {-1}))

# Inspecting the clusters

In [None]:
import numpy as np

In [None]:
# Print the first three documents assigned to the selected cluster,
# truncated to their first 300 characters
cluster = 0
for index in np.where(clusters == cluster)[0][:3]:
  print(f"{abstract_list[index][:300]}...\n")

In [None]:
import pandas as pd

# Second UMAP projection: 384 dimensions down to 2, for easier visualization.
# This rebinds `reduced_embeddings`, replacing the 5-dimensional projection
# that was used for clustering.
reduced_embeddings = UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
).fit_transform(embeddings)

# One row per document: 2-D coordinates, title and cluster label (stored as a string)
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
df["text"] = titles
df["cluster"] = list(map(str, clusters))

df.head()

In [None]:
# Split the documents into clustered points and outliers (cluster label "-1")
is_outlier = df.cluster == "-1"
to_plot = df.loc[~is_outlier, :]
outliers = df.loc[is_outlier, :]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw the outliers as faint grey dots first, then the clustered documents
# on top, coloured by their cluster id; the axes are hidden.
ax = plt.gca()
ax.scatter(outliers.x, outliers.y, alpha=0.05, s=2, c="grey")
ax.scatter(to_plot.x, to_plot.y, c=to_plot.cluster.astype(int), s=2, alpha=0.6, cmap="tab20b")
ax.axis("off")

In [None]:
%%capture
!pip install bertopic

In [None]:
from bertopic import BERTopic

In [None]:
# Fit BERTopic on the abstracts, reusing the embedding model, UMAP model and
# HDBSCAN model defined earlier together with the precomputed embeddings
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
).fit(abstract_list, embeddings)


In [None]:
# Display the summary information BERTopic holds about the fitted topics
topic_model.get_topic_info()

In [None]:
# Inspect the representation of topic 0
topic_model.get_topic(0)

In [None]:
# Search the fitted topics using a free-text search term.
# NOTE(review): the term is written with an underscore ("topic_modeling");
# confirm whether the plain phrase "topic modeling" was intended.
topic_model.find_topics("topic_modeling")

# Visualize topics and documents

In [None]:
# Interactive document plot: titles are used as the document texts and the
# 2-D projection supplies the coordinates. (`projection="umap"` is left disabled.)
fig = topic_model.visualize_documents(
    list(titles),
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    width=1200,
)

# Enlarge the figure font so that the legend is easier to read
fig.update_layout(font={"size": 16})
fig.show()

# Exploring various visualization techniques of BERTopic

In [None]:
# Bar charts of the ranked keywords per topic (library default settings)
topic_model.visualize_barchart()

In [None]:
# Heatmap of the relationships between topics, with n_clusters set to 30
topic_model.visualize_heatmap(n_clusters=30)

In [None]:
# Visualize the potential hierarchical structure of the topics (library default settings)
topic_model.visualize_hierarchy()

In [None]:
# Overview plot of the topics and how they relate to each other (library default settings)
topic_model.visualize_topics()

In [None]:
# Save original representations of BERTopic before applying re-ranking techniques
from copy import deepcopy

# A deep copy keeps this baseline independent of the model's own
# `topic_representations_` object, which the later `update_topics` calls work on.
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration
from bertopic.representation import OpenAI
from transformers import pipeline
import openai

In [None]:
# Wrapper that shows how the top words of each topic differ with and without a representation model
def topic_differences(model, original_topics, nr_topics=5):
  """Compare original and updated topic representations side by side.

  Args:
    model: Object exposing `get_topic(topic)`, which returns the current
      `(word, score)` pairs of a topic (e.g. a fitted BERTopic model).
    original_topics: Container indexed by topic id holding the `(word, score)`
      pairs that were saved before the representation was updated.
    nr_topics: Number of topics to compare, starting at topic 0.

  Returns:
    A DataFrame with the columns "Topic", "Original" and "Updated"; the last
    two hold the top 5 words of the topic joined by " | ".
  """
  def _top_words(word_scores, top_n=5):
    # Keep only the word of each (word, score) pair. A falsy value (an empty
    # list, or e.g. False for a topic the model does not know) yields "".
    return " | ".join(word for word, _ in (word_scores or [])[:top_n])

  # Build all rows first and create the frame once instead of growing it row by row
  rows = [
    [topic, _top_words(original_topics[topic]), _top_words(model.get_topic(topic))]
    for topic in range(nr_topics)
  ]
  return pd.DataFrame(rows, columns=["Topic", "Original", "Updated"])

In [None]:
# Update our topic representations using KeyBERTInspired.
# `abstract_list` is passed because it is the exact list of documents the model
# was fitted on (the raw `abstracts` dataset column is not guaranteed to be a list).
representation_model = KeyBERTInspired()
topic_model.update_topics(abstract_list, representation_model=representation_model)

In [None]:
# Compare the saved original topic words with the current (updated) ones;
# `nr_topics` is left at its default of 5
topic_differences(topic_model, original_topics)

In [None]:
# Inspect the saved baseline representations.
# NOTE(review): this displays the whole object, which can be long -- consider showing only a few topics.
original_topics


In [None]:
# Update our topic representations using MaximalMarginalRelevance.
# `abstract_list` is passed because it is the exact list of documents the model
# was fitted on (the raw `abstracts` dataset column is not guaranteed to be a list).
representation_model = MaximalMarginalRelevance()
topic_model.update_topics(abstract_list, representation_model=representation_model)

# Show topic differences
topic_differences(topic_model, original_topics)

In [None]:
# Using text generation model such as Flan-T5 for topic representations

# Prompt template: BERTopic fills in the [DOCUMENTS] and [KEYWORDS] placeholders per topic
prompt = """ I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords : '[KEYWORDS]'.

Based on the documents and the keywords, what is this topic about?"""

# Update our topic representations using Flan-T5
generator = pipeline("text2text-generation", model="google/flan-t5-small")
# The pipeline is passed positionally: TextGeneration's first parameter is named
# `model`, so the keyword `generator=` is not accepted.
representation_model = TextGeneration(generator, prompt=prompt, doc_length=50, tokenizer='whitespace')
# `abstract_list` is the exact list of documents the model was fitted on
topic_model.update_topics(abstract_list, representation_model=representation_model)

# Show topic differences
topic_differences(topic_model, original_topics)

In [None]:
# Using a large text generation model with more linguistic capabilities - GPT-3.5 for topic representations

# Prompt template: BERTopic fills in the [DOCUMENTS] and [KEYWORDS] placeholders per topic
prompt = """ I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords : '[KEYWORDS]'.

Based on the information above, extract a short topic label in the following format:
topic: <short topic label>
"""

# Update our topic representations using GPT-3.5
# "YOUR_KEY_HERE" is a placeholder: supply your own key and never commit a real one.
client = openai.OpenAI(api_key="YOUR_KEY_HERE")
representation_model = OpenAI(prompt=prompt, client=client, model='gpt-3.5-turbo', exponential_backoff=True, chat=True)
# `abstract_list` is the exact list of documents the model was fitted on
# (the raw `abstracts` dataset column is not guaranteed to be a list).
topic_model.update_topics(abstract_list, representation_model=representation_model)

# Show topic differences
topic_differences(topic_model, original_topics)

In [None]:
# Visualize topics and documents as a static data map of the first 20 topics.
# The titles are passed as a plain list, matching the `visualize_documents` call
# earlier in the notebook (the raw dataset column is not guaranteed to be a list).

fig = topic_model.visualize_document_datamap(
    list(titles),
    topics=list(range(20)),
    reduced_embeddings=reduced_embeddings,
    width=1200,
    label_font_size=11,
    label_wrap_width=20,
    use_medoids=True
)

# Notes on BERTopic