In [3]:
import pandas as pd
import altair as alt

import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Lift Altair's row limit for datasets passed to charts.
alt.data_transformers.disable_max_rows()

# Load the paper data set from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='AI_Related_Papers_Cleaned.csv')


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# One document per row of `df`: the paper abstracts, in row order.
docs = df["Abstract"].to_list()
assert len(docs) == len(df)

# The length check above cannot fail on its own. What actually matters for the
# embedding step is that every abstract is a real string: a missing CSV value
# is loaded by pandas as float NaN and would only surface later as an obscure
# error inside the model fit.
assert all(isinstance(doc, str) for doc in docs), (
    "df['Abstract'] contains non-string entries (e.g. missing values); "
    "clean them before fitting the topic model"
)

In [21]:
# ---------------------------------------------------------------------------
# BERTopic pipeline: build each component explicitly, then wire them together.
# ---------------------------------------------------------------------------

# (1) Embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# (2) Dimensionality reduction; random_state is fixed (optional) so that
#     repeated runs use the same seed.
umap_model = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0,
    metric='cosine', random_state=42,
)

# (3) Clustering of the reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=5, metric='euclidean',
    cluster_selection_method='eom', prediction_data=True,
)

# (4) Topic tokenization; note the stop-word list is explicitly empty.
vectorizer_model = CountVectorizer(stop_words=[])

# (5) Topic representation (class-based TF-IDF)
ctfidf_model = ClassTfidfTransformer()

# (6) Optional fine-tuning of the topic representations with a
#     `bertopic.representation` model
representation_model = KeyBERTInspired()

# Assemble the six components above into a single model.
topic_model = BERTopic(
    embedding_model=embedding_model,            # (1) embeddings
    umap_model=umap_model,                      # (2) dimensionality reduction
    hdbscan_model=hdbscan_model,                # (3) clustering
    vectorizer_model=vectorizer_model,          # (4) tokenization
    ctfidf_model=ctfidf_model,                  # (5) topic words
    representation_model=representation_model,  # (6) fine-tuned representations
)

# Fit on the abstracts; `topics` / `probs` hold the assignments returned by
# this fit, i.e. from before the reduction below.
topics, probs = topic_model.fit_transform(docs)

# Then reduce the number of topics (original note: "reduce to top 10").
# NOTE(review): nr_topics=11 presumably reserves one slot for the outlier
# topic -1 -- confirm against the topic table this produces.
topic_model.reduce_topics(docs, nr_topics=11)

<bertopic._bertopic.BERTopic at 0x361fdad50>

In [22]:
# Show the per-topic summary table of the model as this cell's output
# (in the recorded output: Topic, Count, Name, Representation,
# Representative_Docs; topic -1 is listed first).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,171,-1_software_faults_developers_tools,"[software, faults, developers, tools, programs...",[On-board embedded software developed for spac...
1,0,198,0_code_program_software_patches,"[code, program, software, patches, automated, ...",[Automated program repair is the problem of au...
2,1,85,1_adversarial_vulnerabilities_vulnerability_vu...,"[adversarial, vulnerabilities, vulnerability, ...",[Deep learning (DL) plays a more and more impo...
3,2,65,2_maintainability_software_reliability_mainten...,"[maintainability, software, reliability, maint...","[In software reliability modeling, the paramet..."
4,3,29,3_testing_coverage_test_faults,"[testing, coverage, test, faults, mutants, mut...",[The test case generation is intrinsically a m...
5,4,26,4_metamodels_metamodel_modeling_specification,"[metamodels, metamodel, modeling, specificatio...",[Formal methods and supporting tools have a lo...
6,5,23,5_apps_android_app_gui,"[apps, android, app, gui, guis, mobile, runtim...",[Web applications are widely adopted and their...
7,6,22,6_concurrency_scheduling_protocols_deadlocks,"[concurrency, scheduling, protocols, deadlocks...","[In this paper, we describe Teapot, a domain-s..."
8,7,9,7_markovian_markov_stochastic_probabilistic,"[markovian, markov, stochastic, probabilistic,...",[Equivalence relations can be used to reduce t...
9,8,8,8_programming_programmers_developers_collabora...,"[programming, programmers, developers, collabo...",[Pair Programming is one of the most studied a...


In [23]:
# Per-document information table from the model.
topics_per_doc = topic_model.get_document_info(docs)

# Keep only the rows that are flagged as representative documents AND are
# assigned to topic 0.
reprsentative_docs_0 = topics_per_doc.loc[
    topics_per_doc["Representative_document"].eq(True)
    & topics_per_doc["Topic"].eq(0)
]

In [24]:
# Build the document-level visualization for every topic the model holds.
#
# The topic ids are taken directly from the model instead of being rebuilt as
# range(-1, n - 1). The range form assumes the outlier topic -1 always exists;
# without outliers it would request a non-existent topic -1 and silently drop
# the last real topic. With an outlier topic present, both forms yield the
# same list (-1, 0, 1, ...).
fig = topic_model.visualize_documents(
    docs,
    topics=sorted(topic_model.get_topics()),
    width=1200,
    height=1000
)

# Save as an HTML file
fig.write_html("topic_visualization.html")

In [25]:
reviews = df["Abstract"]

# Number of whitespace-separated tokens per abstract, computed once and
# reused for both statistics. `.str.len()` propagates missing values as NaN
# instead of raising, which `.apply(len)` would do on a float NaN.
word_counts = reviews.str.split().str.len()

# Deliberately NOT named `mean` / `max`: binding `max` at notebook level
# shadows the builtin max() for every cell executed afterwards.
mean_word_count = word_counts.mean()
max_word_count = word_counts.max()

print(f"""Mittelwert: {mean_word_count}, Maximum: {max_word_count}""")

Mittelwert: 207.33177570093457, Maximum: 413


In [26]:
# Intertopic distance visualization, rendered at 850 x 650.
fig_interdistance_topic = topic_model.visualize_topics(width=850, height=650)

# Persist the figure as an HTML file.
fig_interdistance_topic.write_html("topic_interdistance.html")

In [27]:
# Create the bar chart of the topics (default arguments)
fig_barchart = topic_model.visualize_barchart()

# Save as an HTML file
fig_barchart.write_html("topic_barchart.html")

In [28]:
# Topic heatmap, grouped into 3 clusters and rendered at 700 x 700.
fig_heatmap = topic_model.visualize_heatmap(n_clusters=3, width=700, height=700)

# Persist the figure as an HTML file.
fig_heatmap.write_html("topic_heatmap.html")

In [29]:
# Use each paper's publication year as its timestamp. The list comes from the
# same DataFrame rows as `docs`, so both are in the same order.
timestamps = df["Publication Year"].tolist()

# Compute the topics over time from the documents and their timestamps.
topics_over_time = topic_model.topics_over_time(docs, timestamps)

# Build the visualization (width 1000) ...
fig_topic_over_time = topic_model.visualize_topics_over_time(topics_over_time, width=1000)

# ... and save it as an HTML file.
fig_topic_over_time.write_html("topics_over_time.html")