In [62]:
import pandas as pd
import altair as alt

import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Turn off Altair's max-rows guard for chart data.
alt.data_transformers.disable_max_rows()

# Read the paper dataset from the CSV file next to the notebook.
data_path = "AI_Related_Papers_Cleaned.csv"
df = pd.read_csv(data_path)


In [3]:
# Drop rows with missing values (currently disabled)
#df.dropna(subset=['Abstract', 'Publication Year'], inplace=True)

In [63]:
# Abstracts as a plain list, in DataFrame row order; this is the corpus
# handed to the topic model and must stay aligned with the rows of `df`.
docs = df["Abstract"].to_list()

# Fail early with a clear message instead of deep inside the embedding step:
# the corpus must be non-empty and every entry must be text (a missing
# abstract would show up here as a float NaN).
assert docs, "No abstracts found in the 'Abstract' column"
assert all(isinstance(doc, str) for doc in docs), (
    "The 'Abstract' column contains missing or non-text values; "
    "drop or fill them before fitting the topic model"
)

In [102]:
# Target number of topics after merging. With the value 9 the recorded topic
# table has 9 rows: the outlier topic -1 plus the topics 0-7, i.e. 8 real topics.
NR_TOPICS = 9

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed so repeated runs give the same layout
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics (no stop words are removed at this stage)
vectorizer_model = CountVectorizer(stop_words=[])

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    # Step 1 - Extract embeddings
    embedding_model=embedding_model,
    # Step 2 - Reduce dimensionality
    umap_model=umap_model,
    # Step 3 - Cluster reduced embeddings
    hdbscan_model=hdbscan_model,
    # Step 4 - Tokenize topics
    vectorizer_model=vectorizer_model,
    # Step 5 - Extract topic words
    ctfidf_model=ctfidf_model,
    # Step 6 - (Optional) Fine-tune topic representations
    representation_model=representation_model,
)

# Fit the model; these return values describe the topics BEFORE any merging.
topics, probs = topic_model.fit_transform(docs)

# Merge similar topics down to NR_TOPICS (outlier topic included, see above).
topic_model.reduce_topics(docs, nr_topics=NR_TOPICS)

# The reduction re-assigns documents, so the values returned by fit_transform
# are stale now. Re-read them from the model so `topics` / `probs` match what
# every later cell sees through `topic_model`.
topics, probs = topic_model.topics_, topic_model.probabilities_

<bertopic._bertopic.BERTopic at 0x35c501f70>

In [103]:
# Per-topic overview table; the bare name on the last line renders it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,171,-1_software_faults_developers_tools,"[software, faults, developers, tools, programs...","[The use of Artificial Intelligence (AI), and ..."
1,0,286,0_code_program_developers_software,"[code, program, developers, software, automate...",[The Transformer architecture and transfer lea...
2,1,85,1_adversarial_vulnerabilities_vulnerability_vu...,"[adversarial, vulnerabilities, vulnerability, ...",[Deep learning (DL) plays a more and more impo...
3,2,29,2_testing_coverage_test_faults,"[testing, coverage, test, faults, mutants, mut...",[The test case generation is intrinsically a m...
4,3,26,3_metamodels_specification_metamodel_modeling,"[metamodels, specification, metamodel, modelin...",[Formal methods and supporting tools have a lo...
5,4,22,4_concurrency_scheduling_protocols_deadlocks,"[concurrency, scheduling, protocols, deadlocks...","[In this paper, we describe Teapot, a domain-s..."
6,5,9,5_markovian_markov_stochastic_probabilistic,"[markovian, markov, stochastic, probabilistic,...",[Equivalence relations can be used to reduce t...
7,6,8,6_programming_programmers_developers_collabora...,"[programming, programmers, developers, collabo...",[Pair Programming is one of the most studied a...
8,7,6,7_icse_conference_2022_ieee,"[icse, conference, 2022, ieee, 2023, acm, 2024...",[What's new with ICSE 2022 ICSE 2022 is here! ...


In [104]:
# One row per document with its topic assignment.
topics_per_doc = topic_model.get_document_info(docs)

# Keep only the rows flagged as representative documents that belong to topic 0.
is_representative = topics_per_doc["Representative_document"] == True
in_topic_0 = topics_per_doc["Topic"] == 0
reprsentative_docs_0 = topics_per_doc[is_representative & in_topic_0]

In [105]:
# Select the real topic ids to highlight. get_topics() also contains the
# outlier topic -1, so range(len(...)) would count it and run one past the
# highest existing topic id; take the actual non-outlier ids instead.
topic_ids = sorted(topic for topic in topic_model.get_topics() if topic != -1)

# Create the document visualization for those topics
fig = topic_model.visualize_documents(
    docs,
    topics=topic_ids,
    width=800,
    height=800,
)

# Save as an HTML file
fig.write_html("topic_visualization.html")

In [106]:
# Word-count statistics of the abstracts (words = whitespace-separated tokens).
reviews = df["Abstract"]

# Split once and reuse the per-abstract word counts for both statistics.
words_per_abstract = reviews.str.split().str.len()

# Deliberately not named `mean` / `max`: assigning to `max` would shadow the
# builtin for every cell executed afterwards in the same kernel.
mean_words = words_per_abstract.mean()
max_words = words_per_abstract.max()

print(f"""Mittelwert: {mean_words}, Maximum: {max_words}""")

Mittelwert: 207.33177570093457, Maximum: 413


In [110]:
# Create the inter-topic distance visualization (850 x 650) ...
fig_interdistance_topic = topic_model.visualize_topics(width=850, height=650)
# ... and save it as an HTML file.
fig_interdistance_topic.write_html("topic_interdistance.html")

In [111]:
# Create the topic bar chart
fig_barchart = topic_model.visualize_barchart()

# Save as an HTML file
fig_barchart.write_html("topic_barchart.html")

In [112]:
# Create the topic heatmap (700 x 700, n_clusters=3) ...
fig_heatmap = topic_model.visualize_heatmap(n_clusters=3, width=700, height=700)
# ... and save it as an HTML file.
fig_heatmap.write_html("topic_heatmap.html")

In [113]:
# One timestamp per document: the "Publication Year" column as a plain list.
timestamps = df["Publication Year"].tolist()

# Compute the topics over time from the documents and their timestamps,
# build the chart (1000 px wide) and save it as an HTML file.
topics_over_time = topic_model.topics_over_time(docs, timestamps)
fig_topic_over_time = topic_model.visualize_topics_over_time(topics_over_time, width=1000)
fig_topic_over_time.write_html("topics_over_time.html")

In [95]:
# Print the timestamps in ascending order. sorted() returns a new list, so
# `timestamps` itself keeps its original order; an in-place .sort() would
# break its position-by-position pairing with `docs` for any later use.
print(sorted(timestamps))

[1994, 1994, 1994, 1994, 1994, 1994, 1994, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1996, 1996, 1996, 1996, 1996, 1997, 1998, 1998, 1998, 1998, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2001, 2001, 2001, 2001, 2001, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2009, 2009, 2009, 2009, 2009, 2009, 2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013, 201

In [100]:
# Show the available columns: heading on the first line, list of names on the second.
print("Verfügbare Spalten im DataFrame:", df.columns.tolist(), sep="\n")

Verfügbare Spalten im DataFrame:
['Document Title', 'Authors', 'Author Affiliations', 'Publication Title', 'Date Added To Xplore', 'Publication Year', 'Volume', 'Issue', 'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI', 'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms', 'Mesh_Terms', 'Article Citation Count', 'Patent Citation Count', 'Reference Count', 'License', 'Online Date', 'Issue Date', 'Meeting Date', 'Publisher', 'Document Identifier', 'is_ai_related']


In [101]:
# Column whose values define the classes for the per-class topic breakdown.
# NOTE(review): the previous choice, "Document Title", is presumably different
# for every paper, which gives one class per document and a chart with nothing
# to compare. "Publication Title" is an existing column that groups several
# papers together - confirm it is the grouping you want.
CLASS_COLUMN = "Publication Title"

# One class label per document, in the same row order as `docs`; missing
# values get an explicit label so every document belongs to a named class.
classes = df[CLASS_COLUMN].fillna("Unknown").astype(str).to_list()

# Topic representation per class, then the chart.
topics_per_classes = topic_model.topics_per_class(docs, classes)
fig_topic_per_class = topic_model.visualize_topics_per_class(
    topics_per_classes,
    width=2000,
    height=2000,
)

# Save as an HTML file
fig_topic_per_class.write_html("topics_per_class.html")