In [1]:
import pandas as pd
import altair as alt

import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Lift Altair's row limit so that large frames can be passed to charts.
alt.data_transformers.disable_max_rows()

# Load the data
dataset_path = 'Data/Dataset.csv'
df = pd.read_csv(dataset_path)


ImportError: Unable to import required dependencies:
numpy: No module named 'numpy'

In [None]:
# Drop rows with missing values in the columns used below
# (currently disabled: the statement is commented out, so `df` is left unchanged).
#df.dropna(subset=['Abstract', 'Publication Year'], inplace=True)

In [None]:
# Documents for the topic model: one abstract per row of `df`, in row order.
docs = df["Abstract"].to_list()
assert len(docs) == len(df)

# Fail fast with a readable message if any abstract is missing or not text;
# otherwise the problem only surfaces later, deep inside model fitting.
assert all(isinstance(doc, str) for doc in docs), (
    "Column 'Abstract' contains missing or non-string values; "
    "clean `df` before building `docs`."
)

In [None]:
# ---------------------------------------------------------------------------
# Build the individual pipeline components, then assemble and fit BERTopic.
# ---------------------------------------------------------------------------

# 1) Embeddings: sentence encoder applied to every document.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2) Dimensionality reduction of the embeddings.
#    The seed is optional; it is pinned here so repeated runs are comparable.
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42)

# 3) Clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# 4) Tokenisation of the documents per topic (explicitly empty stop-word list).
vectorizer_model = CountVectorizer(stop_words=[])

# 5) Class-based TF-IDF used to weight the words of each topic.
ctfidf_model = ClassTfidfTransformer()

# 6) Optional fine-tuning of the topic representations
#    via a `bertopic.representation` model.
representation_model = KeyBERTInspired()

# Assemble the six components into a single model.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the abstracts; keeps the per-document topic assignments and probabilities.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Display the topic overview of the fitted model as the cell output.
topic_model.get_topic_info()

In [None]:
# Per-document information from the fitted model.
topics_per_doc = topic_model.get_document_info(docs)

# Keep only the rows flagged as representative that are assigned to topic 0.
is_representative = topics_per_doc["Representative_document"].eq(True)
in_topic_zero = topics_per_doc["Topic"].eq(0)
reprsentative_docs_0 = topics_per_doc[is_representative & in_topic_zero]

In [None]:
# Document-level visualisation of the fitted model, rendered at 800 x 800.
fig = topic_model.visualize_documents(docs, width=800, height=800)
fig.show()

In [None]:
# Word-count statistics of the abstracts (words = whitespace-separated tokens).
reviews = df["Abstract"]

# Compute the per-abstract word count once and reuse it for both statistics.
# `.str.len()` yields NaN for missing abstracts instead of raising, and
# mean()/max() skip NaN by default.
word_counts = reviews.str.split().str.len()

# Deliberately not named `mean`/`max`: assigning to `max` would shadow the
# builtin for every cell executed afterwards in the same kernel.
mean_words = word_counts.mean()
max_words = word_counts.max()

print(f"""Mittelwert: {mean_words}, Maximum: {max_words}""")

In [None]:
# Topic-level visualisation of the fitted model (800 x 600), shown as the cell output.
fig_interdistance_topic = topic_model.visualize_topics(width=800, height=600)
fig_interdistance_topic

In [None]:
# Display the bar-chart visualisation of the fitted model with default settings.
topic_model.visualize_barchart()

In [None]:
# Heatmap visualisation of the fitted model, with 3 clusters requested, at 700 x 700.
fig_heatmap = topic_model.visualize_heatmap(n_clusters=3, width=700, height=700)
fig_heatmap

In [None]:
# One timestamp per document: the publication-year column, in row order.
timestamps = df["Publication Year"].to_list()

# Topic representations over time, computed from the documents and their timestamps.
topics_over_time = topic_model.topics_over_time(docs, timestamps)

# Plot the result (width 1000) and show it as the cell output.
fig_topic_over_time = topic_model.visualize_topics_over_time(topics_over_time, width=1000)
fig_topic_over_time

In [None]:
# Print the timestamps in ascending order.
# `sorted()` returns a new list, so `timestamps` keeps its original row order
# and stays positionally aligned with `docs`; an in-place `.sort()` would
# silently break that pairing for any cell (re-)run afterwards.
print(sorted(timestamps))

In [None]:
# Class label per document: the title column, in row order.
# NOTE(review): if titles are unique this gives one class per document - confirm
# that "Title" is the intended grouping column.
classes = df["Title"].to_list()

# Topic representations per class, computed from the documents and their labels.
topics_per_classes = topic_model.topics_per_class(docs, classes)

# Plot the result at 2000 x 2000 and show it as the cell output.
fig_topic_per_class = topic_model.visualize_topics_per_class(
    topics_per_classes,
    width=2000,
    height=2000,
)
fig_topic_per_class