In [22]:
import pandas as pd

# Load the preprocessed corpus, taking the first CSV column as the row index.
# Rows with a missing 'processedText' value are dropped and the index is then
# renumbered from 0, so row labels and row positions coincide afterwards.
# Built as one chain (no inplace=True) so re-running the cell always starts
# from the file contents and never from a half-mutated frame.
reducedPapers = (
    pd.read_csv(
        "../processedData/processedData.csv",
        sep=',',
        index_col=0,
    )
    .dropna(subset=['processedText'])
    .reset_index(drop=True)
)
# Print the column overview (dtypes and non-null counts) as the cell output.
reducedPapers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152165 entries, 0 to 152164
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   index          152165 non-null  int64 
 1   timePeriod     152165 non-null  object
 2   processedText  152165 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.5+ MB


In [23]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# UMAP is stochastic: without a fixed seed every run yields different topics.
# NOTE: seeding UMAP trades speed for reproducibility (it limits UMAP's
# parallelism), so the reduction step may take noticeably longer.
RANDOM_STATE = 42

model = BERTopic(
    # Reduce the sentence embeddings to 3 dimensions before clustering.
    umap_model=UMAP(n_neighbors=5, n_components=3, min_dist=0.0, random_state=RANDOM_STATE),
    # Cluster the reduced embeddings; clusters need at least 50 documents.
    hdbscan_model=HDBSCAN(min_cluster_size=50, min_samples=5, prediction_data=True),
    embedding_model=SentenceTransformer('all-MiniLM-L6-v2'),
    # ngram_range=(1, 2): n-grams of one to two words per topic term.
    vectorizer_model=CountVectorizer(ngram_range=(1, 2), stop_words='english'),
    top_n_words=5,
    language='english',
    calculate_probabilities=False,  # the author noted True as the alternative setting
    verbose=True,
    nr_topics="auto",
)


In [24]:
import pickle
# Make sure every document is a Python string before it reaches the model.
reducedPapers.loc[:, 'processedText'] = reducedPapers['processedText'].astype("str")

# Fit the topic model. The documents are handed over as a plain list of
# strings, which is the input type BERTopic documents for fit_transform.
# The second return value gets a real name instead of `_`: assigning to `_`
# in IPython shadows the built-in "last output" variable for the session.
topics, probabilities = model.fit_transform(reducedPapers['processedText'].tolist())

Batches:   0%|          | 0/4756 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Serialize the per-document topic assignments returned by fit_transform.
with open('../processedData/topics.pickle', 'wb') as topics_file:
    pickle.dump(topics, topics_file)

In [None]:
# Derive the topic hierarchy from the fitted model and print it as a text tree.
documents = reducedPapers['processedText']
hierarchical_topics = model.hierarchical_topics(documents)
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Draw the topic hierarchy, export it as a PNG file, then show it in the notebook.
fig = model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
    title='<b>Hierarchical Clustering</b>',
)
fig.write_image(
    "../diagramOutput/hierarchicalClustering.png",
    format='png',
    scale=3,
    engine="kaleido",
)
fig.show()

In [None]:
# Build the topic heatmap; the bare name on the last line is the cell's displayed output.
heatmap_fig = model.visualize_heatmap()
heatmap_fig

In [None]:
import pickle
# Write the topic assignments to '../processedData/topics.pickle'.
# NOTE(review): an earlier cell already writes `topics` to this same path;
# this cell looks redundant unless `topics` changed in between - confirm.
with open('../processedData/topics.pickle', 'wb') as out_file:
    out_file.write(pickle.dumps(topics))

In [None]:
# Bar charts of the 5 highest-scoring words for up to 50 topics; saved as PNG and displayed.
fig = model.visualize_barchart(
    title='<b>Top Word Scores</b>',
    top_n_topics=50,
    n_words=5,
)
fig.write_image(
    "../diagramOutput/topicList.png",
    format='png',
    scale=3,
    engine="kaleido",
)
fig.show()

In [None]:
# Compute the topic representations over time, using the 'timePeriod' column
# as timestamps grouped into 20 bins, with both tuning options switched on.
topics_over_time = model.topics_over_time(
    reducedPapers['processedText'],
    reducedPapers['timePeriod'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True,
)

# Store the result on disk next to the other processed data.
with open('../processedData/topics_over_time.pickle', 'wb') as tot_file:
    pickle.dump(topics_over_time, tot_file)

In [None]:
# Build the topics-over-time figure once, export it as PNG and display it.
# The figure is created a single time: an unassigned visualize call in the
# middle of a cell is neither displayed nor saved, it only costs run time.
# (Pass top_n_topics=<k> to restrict how many topics are drawn.)
fig = model.visualize_topics_over_time(topics_over_time, title='<b>Topics over Time</b>')
fig.write_image("../diagramOutput/topicsOverTime.png", engine="kaleido", scale=3, format='png')
fig.show()


In [None]:
# Intertopic distance map: create the figure, export it as PNG, then display it.
fig = model.visualize_topics(
    title='<b>Intertopic Distance Map</b>',
)
fig.write_image(
    "../diagramOutput/intertopicDistanceMap.png",
    format='png',
    scale=3,
    engine="kaleido",
)
fig.show()

In [None]:
# Query string for the topic search: the terms are concatenated as-is, so the
# result is space-separated and ends with a trailing space.
similarWords = "".join([
    "king ",
    "queen ",
    "royalty ",
    "majesty ",
    "crown ",
    "prince ",
    "princess ",
])

In [None]:
# Look up the topics that match the query string; find_topics returns the
# topic ids together with their similarity scores.
topic_matches = model.find_topics(similarWords)
similar_topics, similarity = topic_matches
print(similar_topics)

In [None]:
# Display the representation of the first topic returned for the query.
first_match = similar_topics[0]
model.get_topic(first_match)

In [None]:
# Topics-over-time figure limited to the topics found for the query string;
# exported as PNG and then displayed.
fig = model.visualize_topics_over_time(
    topics_over_time,
    topics=similar_topics,
    title='<b>Topics over Time</b>',
)
fig.write_image(
    "../diagramOutput/topicsOverTimeReduced.png",
    format='png',
    scale=3,
    engine="kaleido",
)
fig.show()

In [None]:
import pandas as pd
from umap import UMAP
%matplotlib inline

# Embed every document again and project the embeddings to 2-D for plotting.
# NOTE(review): `_extract_embeddings` is a private BERTopic method and may
# change between releases; this also repeats the embedding work of the fit.
# The documents are passed as a plain list so the call does not depend on how
# the pandas Series index happens to be labelled.
embeddings_vis = model._extract_embeddings(reducedPapers['processedText'].tolist(), method="document")

# random_state pins the otherwise stochastic layout, so the scatter plots
# built from df_vis look the same on every run.
umap_model_vis = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
).fit(embeddings_vis)

# One row per document: 2-D coordinates plus the topic id assigned by the model.
df_vis = pd.DataFrame(umap_model_vis.embedding_, columns=["x", "y"])
df_vis["topic"] = topics

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Scatter plot of all documents in the 2-D projection, highlighting only the
# topics that matched the query (similar_topics); everything else is grey.

# Plot parameters
fontsize = 5 #12

# Relabel every document whose topic is not one of the selected topics as -1,
# in a single vectorised step instead of one pass over the frame per topic.
to_plot = df_vis.copy()
to_plot.loc[~to_plot.topic.isin(similar_topics), 'topic'] = -1

outliers = to_plot.loc[to_plot.topic == -1]
non_outliers = to_plot.loc[to_plot.topic != -1]

# The colormap is sized by the total number of distinct topic ids.
n_topics = len(np.unique(topics))
cmap = plt.get_cmap('hsv_r', n_topics)

fig, ax = plt.subplots(figsize=(20, 10), facecolor='white')
# Grey background layer first, coloured topics on top.
ax.scatter(outliers['x'], outliers['y'], c="#E0E0E0", s=1, alpha=.3)
scatter = ax.scatter(non_outliers['x'], non_outliers['y'], c=non_outliers['topic'], s=1, alpha=.3, cmap=cmap)

# Label each highlighted topic at the mean position of its documents.
# Centroids come from the non-outlier rows directly, so no real topic is lost
# when there happens to be no -1 group (a positional "skip the first row"
# would drop the lowest topic id in that case).
centroids = non_outliers.groupby("topic")[["x", "y"]].mean().reset_index()
for _idx, centroid in centroids.iterrows():
    topic = int(centroid.topic)
    text = f"{topic}: " + "_".join([x[0] for x in model.get_topic(topic)[:5]])
    ax.text(centroid.x, centroid.y*1.01, text, fontsize=fontsize, horizontalalignment='center')

ax.text(0.99, 0.01, "BERTopic - topics", transform=ax.transAxes, horizontalalignment="right", color="black")
plt.xticks([], [])
plt.yticks([], [])
plt.colorbar(scatter)

plt.savefig("../diagramOutput/clusterSelected.png", dpi=900, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Scatter plot of all documents in the 2-D projection, colouring only the
# topics whose id is below top_n; all other documents are drawn in grey.

# Plot parameters
top_n = 50
fontsize = 5 #12

# Relabel topics with id >= top_n as -1. Only the 'topic' column is written:
# assigning through a bare boolean mask (to_plot[mask] = -1) would overwrite
# x and y as well and collapse all those documents onto the point (-1, -1).
to_plot = df_vis.copy()
to_plot.loc[to_plot.topic >= top_n, 'topic'] = -1
outliers = to_plot.loc[to_plot.topic == -1]
non_outliers = to_plot.loc[to_plot.topic != -1]

# The colormap is sized by the total number of distinct topic ids.
n_topics = len(np.unique(topics))
cmap = plt.get_cmap('hsv_r', n_topics)

fig, ax = plt.subplots(figsize=(20, 10), facecolor='white')
# Grey background layer first, coloured topics on top.
ax.scatter(outliers['x'], outliers['y'], c="#E0E0E0", s=1, alpha=.3)
scatter = ax.scatter(non_outliers['x'], non_outliers['y'], c=non_outliers['topic'], s=1, alpha=.3, cmap=cmap)

# Label each coloured topic at the mean position of its documents. Centroids
# are computed from the non-outlier rows, so no real topic is dropped when
# there is no -1 group in the data.
centroids = non_outliers.groupby("topic")[["x", "y"]].mean().reset_index()
for _idx, centroid in centroids.iterrows():
    topic = int(centroid.topic)
    text = f"{topic}: " + "_".join([x[0] for x in model.get_topic(topic)[:2]])
    ax.text(centroid.x, centroid.y*1.01, text, fontsize=fontsize, horizontalalignment='center')

ax.text(0.99, 0.01, f"BERTopic - Top {top_n} topics", transform=ax.transAxes, horizontalalignment="right", color="black")
plt.xticks([], [])
plt.yticks([], [])
plt.colorbar(scatter)

plt.savefig("../diagramOutput/cluster.png", dpi=900, bbox_inches='tight')
plt.show()

In [None]:
#model.visualize_hierarchical_documents(reducedPapers['processedText'], hierarchical_topics, embeddings=embeddings_vis)

In [None]:
#reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings_vis)
#model.visualize_hierarchical_documents(reducedPapers['processedText'], hierarchical_topics, reduced_embeddings=reduced_embeddings)