In [27]:
import pandas as pd
import altair as alt
import numpy as np

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Deaktiviere Altair Row Limit
alt.data_transformers.disable_max_rows()

# Daten laden
print("Lade Daten...")
df = pd.read_csv('AI_Related_Papers_Cleaned.csv')
print("Verfügbare Spalten:", df.columns.tolist())
print("\nDatensatz Größe:", len(df))

# Extrahiere Dokumente für Topic Modeling
docs = df["Abstract"].to_list()
print("\nAnzahl der Abstracts:", len(docs))


Lade Daten...
Verfügbare Spalten: ['Document Title', 'Authors', 'Author Affiliations', 'Publication Title', 'Date Added To Xplore', 'Publication Year', 'Volume', 'Issue', 'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI', 'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms', 'Mesh_Terms', 'Article Citation Count', 'Patent Citation Count', 'Reference Count', 'License', 'Online Date', 'Issue Date', 'Meeting Date', 'Publisher', 'Document Identifier', 'is_ai_related']

Datensatz Größe: 642

Anzahl der Abstracts: 642


In [28]:
# 1. Publikationstrend-Analyse
print("\nAnalysiere Publikationstrends...")
yearly_pubs = df.groupby('Publication Year').size().reset_index(name='count')

# Erstelle ein vollständiges DataFrame für alle Jahre von 1994 bis 2024
all_years = pd.DataFrame({'Publication Year': range(1994, 2025)})
yearly_pubs = pd.merge(all_years, yearly_pubs, on='Publication Year', how='left')
yearly_pubs['count'] = yearly_pubs['count'].fillna(0)

# Erstelle den Basis-Linienplot
base = alt.Chart(yearly_pubs).encode(
    x=alt.X('Publication Year:Q',
            scale=alt.Scale(domain=[1994, 2024]),
            axis=alt.Axis(
                tickCount=yearly_pubs['Publication Year'].nunique(),
                format='d'
            ),
            title='Jahr'),
    y=alt.Y('count:Q',
            title='Anzahl Publikationen')
)

# Linienplot mit Punkten
lines = base.mark_line()
points = base.mark_point(size=50)

# Text-Labels für die Datenpunkte
text = base.mark_text(
    align='center',
    baseline='bottom',
    dy=-10  # Verschiebung nach oben
).encode(
    text=alt.Text('count:Q', format='.0f')  # Formatierung als ganze Zahl
)

# Kombiniere alle Elemente
pub_trend_chart = (lines + points + text).properties(
    title='Entwicklung der KI-Publikationen im Software Engineering',
    width=1000,
    height=500
).configure_axis(
    grid=True,
    gridOpacity=0.3
).configure_view(
    stroke=None
)

pub_trend_chart.save('publication_trends.html')
print("Publikationstrend-Visualisierung gespeichert als 'publication_trends.html'")


Analysiere Publikationstrends...
Publikationstrend-Visualisierung gespeichert als 'publication_trends.html'


In [29]:
# 2. Topic Modeling
print("\nFühre Topic Modeling durch...")
# Embedding Model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# UMAP Model
umap_model = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42
)

# HDBSCAN Model
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# Topic Model
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=CountVectorizer(stop_words=[]),
    ctfidf_model=ClassTfidfTransformer(),
    representation_model=KeyBERTInspired()
)

# Fit Model
topics, probs = topic_model.fit_transform(docs)
topic_model.reduce_topics(docs, nr_topics=11)

# Topic Info ausgeben
print("\nGefundene Topics:")
print(topic_model.get_topic_info())

# Topic Visualisierungen erstellen und speichern
print("\nErstelle Topic-Visualisierungen...")


Führe Topic Modeling durch...

Gefundene Topics:
    Topic  Count                                               Name  \
0      -1    171                -1_software_faults_developers_tools   
1       0    198                    0_code_program_software_patches   
2       1     85  1_adversarial_vulnerabilities_vulnerability_vu...   
3       2     65  2_maintainability_software_reliability_mainten...   
4       3     29                     3_testing_coverage_test_faults   
5       4     26      4_metamodels_metamodel_modeling_specification   
6       5     23                             5_apps_android_app_gui   
7       6     22       6_concurrency_scheduling_protocols_deadlocks   
8       7      9        7_markovian_markov_stochastic_probabilistic   
9       8      8  8_programming_programmers_developers_collabora...   
10      9      6                        9_icse_conference_2022_ieee   

                                       Representation  \
0   [software, faults, developers, tools

In [30]:
# Document Visualization
fig_docs = topic_model.visualize_documents(
    docs,
    topics=list(range(-1, len(topic_model.get_topics())-1)),
    width=1200,
    height=1000
)
fig_docs.write_html("topic_visualization.html")
print("Dokument-Visualisierung gespeichert als 'topic_visualization.html'")

Dokument-Visualisierung gespeichert als 'topic_visualization.html'


In [31]:
# Topic Distance Visualization
fig_distance = topic_model.visualize_topics(
    width=850,
    height=650
)
fig_distance.write_html("topic_interdistance.html")
print("Topic-Distanz-Visualisierung gespeichert als 'topic_interdistance.html'")

Topic-Distanz-Visualisierung gespeichert als 'topic_interdistance.html'


In [32]:
# Barchart Visualization
fig_barchart = topic_model.visualize_barchart()
fig_barchart.write_html("topic_barchart.html")
print("Barchart-Visualisierung gespeichert als 'topic_barchart.html'")

Barchart-Visualisierung gespeichert als 'topic_barchart.html'


In [33]:
# Heatmap Visualization
fig_heatmap = topic_model.visualize_heatmap(
    n_clusters=3,
    width=700,
    height=700
)
fig_heatmap.write_html("topic_heatmap.html")
print("Heatmap-Visualisierung gespeichert als 'topic_heatmap.html'")

Heatmap-Visualisierung gespeichert als 'topic_heatmap.html'


In [34]:
# 3. Zeitliche Entwicklung der Topics
print("\nAnalysiere zeitliche Entwicklung der Topics...")
timestamps = df["Publication Year"].tolist()
topics_over_time = topic_model.topics_over_time(docs, timestamps)
fig_time = topic_model.visualize_topics_over_time(
    topics_over_time,
    width=1000
)
fig_time.write_html("topics_over_time.html")
print("Zeitliche Entwicklung gespeichert als 'topics_over_time.html'")



Analysiere zeitliche Entwicklung der Topics...
Zeitliche Entwicklung gespeichert als 'topics_over_time.html'


In [35]:
# 4. Technologie-Trend-Analyse
print("\nAnalysiere Technologie-Trends...")
tech_keywords = tech_categories = {
    'Machine Learning': ['machine learning', 'ml', 'supervised learning', 'unsupervised learning'],
    'Deep Learning': ['deep learning', 'neural network', 'cnn', 'rnn', 'lstm'],
    'Natural Language Processing': ['nlp', 'natural language processing', 'text analysis', 'language model'],
    'Computer Vision': ['computer vision', 'image processing', 'object detection'],
    'Reinforcement Learning': ['reinforcement learning', 'rl', 'q-learning'],
    'Expert Systems': ['expert system', 'knowledge base', 'rule-based']
}

tech_trends = pd.DataFrame()
for keyword in tech_keywords:
    tech_trends[keyword] = df['Abstract'].str.contains(keyword, case=False).astype(int)

# Aggregiere nach Jahr
yearly_tech_trends = pd.concat([
    df['Publication Year'],
    tech_trends
], axis=1).groupby('Publication Year').sum().reset_index()

# Erstelle ein vollständiges DataFrame für alle Jahre von 1994 bis 2024
all_years = pd.DataFrame({'Publication Year': range(1994, 2025)})
yearly_tech_trends = pd.merge(all_years, yearly_tech_trends, on='Publication Year', how='left')
yearly_tech_trends = yearly_tech_trends.fillna(0)

# Erstelle eine Altair-Visualisierung
tech_trend_chart = alt.Chart(
    yearly_tech_trends.melt('Publication Year', var_name='Technology', value_name='Count')
).mark_line(point=True).encode(
    x=alt.X('Publication Year:Q',
            scale=alt.Scale(domain=[1994, 2024]),
            axis=alt.Axis(
                tickCount=yearly_tech_trends['Publication Year'].nunique(),
                format='d'  # 'd' format für ganze Zahlen
            ),
            title='Jahr'),
    y=alt.Y('Count:Q', title='Anzahl der Erwähnungen'),
    color=alt.Color('Technology:N', title='Technologie'),
    tooltip=['Publication Year:Q', 'Technology:N', 'Count:Q']
).properties(
    width=1000,
    height=600,
    title='Entwicklung verschiedener KI-Technologien über die Zeit'
)

# Speichere die Visualisierung als HTML
tech_trend_chart.save('technology_trends.html')
print("Technologie-Trend-Visualisierung gespeichert als 'technology_trends.html'")


Analysiere Technologie-Trends...
Technologie-Trend-Visualisierung gespeichert als 'technology_trends.html'


In [36]:
# 5. Zitationsanalyse
print("\nFühre Zitationsanalyse durch...")

# Top zitierte Papers
print("\nTop 10 meist-zitierte Papers:")
top_cited = df.nlargest(10, 'Article Citation Count')
for idx, paper in top_cited.iterrows():
    print(f"\nTitel: {paper['Document Title']}")
    print(f"Zitationen: {paper['Article Citation Count']}")
    print(f"Jahr: {paper['Publication Year']}")
    print(f"Journal: {paper['Publication Title']}")
    print("-" * 100)

# Zitationsentwicklung über Zeit
yearly_citations = df.groupby('Publication Year')['Article Citation Count'].sum().reset_index()

# Visualisierung mit Altair
citation_chart = alt.Chart(yearly_citations).mark_line(point=True).encode(
    x=alt.X('Publication Year:Q',
            scale=alt.Scale(domain=[1994, 2024]),
            axis=alt.Axis(format='d'),
            title='Jahr'),
    y=alt.Y('Article Citation Count:Q',
            title='Gesamtzahl der Zitationen'),
    tooltip=['Publication Year:Q', 'Article Citation Count:Q']
).properties(
    title='Entwicklung der Zitationen über Zeit',
    width=1000,
    height=500
)

# Füge Textlabels hinzu
text = alt.Chart(yearly_citations).mark_text(
    align='center',
    baseline='bottom',
    dy=-10
).encode(
    x='Publication Year:Q',
    y='Article Citation Count:Q',
    text=alt.Text('Article Citation Count:Q', format='.0f')
)

# Kombiniere Chart und Labels
final_citation_chart = (citation_chart + text).configure_axis(
    grid=True,
    gridOpacity=0.3
)
final_citation_chart.save('citation_trends.html')


Führe Zitationsanalyse durch...

Top 10 meist-zitierte Papers:

Titel: Benchmarking Classification Models for Software Defect Prediction: A Proposed Framework and Novel Findings
Zitationen: 849.0
Jahr: 2008
Journal: IEEE Transactions on Software Engineering
----------------------------------------------------------------------------------------------------

Titel: GenProg: A Generic Method for Automatic Software Repair
Zitationen: 718.0
Jahr: 2012
Journal: IEEE Transactions on Software Engineering
----------------------------------------------------------------------------------------------------

Titel: A critique of software defect prediction models
Zitationen: 694.0
Jahr: 1999
Journal: IEEE Transactions on Software Engineering
----------------------------------------------------------------------------------------------------

Titel: Simplifying and isolating failure-inducing input
Zitationen: 670.0
Jahr: 2002
Journal: IEEE Transactions on Software Engineering
---------------------

In [37]:
# 6. Journal Analyse
print("\nAnalysiere führende Journals...")
journal_stats = df.groupby('Publication Title').agg({
    'Document Title': 'count',
    'Article Citation Count': 'sum'
}).reset_index()

journal_stats.columns = ['Journal', 'Anzahl Papers', 'Gesamtzitationen']
journal_stats['Durchschnittliche Zitationen'] = journal_stats['Gesamtzitationen'] / journal_stats['Anzahl Papers']
journal_stats = journal_stats.sort_values('Gesamtzitationen', ascending=False)

# Top 10 Journals visualisieren
top_journals = journal_stats.head(10)
journal_chart = alt.Chart(top_journals).mark_bar().encode(
    y=alt.Y('Journal:N', 
            sort='-x',
            title='Journal'),
    x=alt.X('Gesamtzitationen:Q',
            title='Gesamtzahl der Zitationen'),
    tooltip=['Journal', 'Anzahl Papers', 'Gesamtzitationen', 'Durchschnittliche Zitationen']
).properties(
    title='Top 10 Journals nach Gesamtzitationen',
    width=1000,
    height=400
)

# Füge Werte an den Balken hinzu
text = alt.Chart(top_journals).mark_text(
    align='left',
    baseline='middle',
    dx=3
).encode(
    y='Journal:N',
    x='Gesamtzitationen:Q',
    text=alt.Text('Gesamtzitationen:Q', format='.0f')
)

final_journal_chart = (journal_chart + text).configure_axis(
    grid=True,
    gridOpacity=0.3
)
final_journal_chart.save('journal_analysis.html')

print("\nJournal-Analyse abgeschlossen und gespeichert")


Analysiere führende Journals...

Journal-Analyse abgeschlossen und gespeichert


In [38]:
# 7. Länder- und Institutionsanalyse
print("\nAnalysiere Länder und Institutionen...")

def extract_country(affiliation):
    # Spezifische Länder und ihre Varianten
    common_countries = {
        'USA': ['USA', 'United States', 'U.S.A', 'United States of America', 'United States of'],
        'UK': ['UK', 'United Kingdom', 'England', 'Scotland', 'Wales', 'Britain'],
        'Germany': ['Germany', 'Deutschland'],
        'China': ['China', 'P.R. China', 'People\'s Republic of China'],
        'Canada': ['Canada'],
        'Japan': ['Japan'],
        'India': ['India'],
        'Australia': ['Australia'],
        'France': ['France'],
        'Italy': ['Italy'],
        'Spain': ['Spain'],
        'Netherlands': ['Netherlands', 'The Netherlands'],
        'Switzerland': ['Switzerland'],
        'South Korea': ['South Korea', 'Korea'],
        'Singapore': ['Singapore'],
        'Brazil': ['Brazil'],
        'Sweden': ['Sweden'],
        'Norway': ['Norway'],
        'Denmark': ['Denmark'],
        'Austria': ['Austria']
    }
    
    if pd.isna(affiliation):
        return None
        
    affiliation = str(affiliation).lower()
    for country, variants in common_countries.items():
        if any(variant.lower() in affiliation for variant in variants):
            return country
    return None

def extract_institution(affiliation):
    if pd.isna(affiliation):
        return None
    
    # Erste Institution nehmen (falls mehrere angegeben sind)
    institution = affiliation.split(';')[0].strip()
    
    # Kurze oder leere Institutionsnamen ausfiltern
    if len(institution) < 5:
        return None
        
    return institution

# Länder und Institutionen extrahieren
df['Country'] = df['Author Affiliations'].apply(extract_country)
df['Institution'] = df['Author Affiliations'].apply(extract_institution)

# Länderanalyse
valid_countries = df['Country'].dropna().value_counts().head(10)
country_data = pd.DataFrame({
    'Country': valid_countries.index,
    'Count': valid_countries.values
})

# Länder-Visualisierung
country_chart = alt.Chart(country_data).mark_bar().encode(
    y=alt.Y('Country:N', 
            sort='-x',
            title='Land'),
    x=alt.X('Count:Q',
            title='Anzahl Publikationen'),
    tooltip=['Country:N', 'Count:Q']
).properties(
    title='Top 10 Länder in der KI-Software-Engineering-Forschung',
    width=1000,
    height=400
)

# Text-Labels für Länder
country_text = alt.Chart(country_data).mark_text(
    align='left',
    baseline='middle',
    dx=3
).encode(
    y='Country:N',
    x='Count:Q',
    text=alt.Text('Count:Q', format='.0f')
)

final_country_chart = (country_chart + country_text).configure_axis(
    grid=True,
    gridOpacity=0.3
)
final_country_chart.save('country_analysis.html')


# Statistiken ausgeben
print("\nTop 10 Länder und ihre Publikationszahlen:")
for country, count in valid_countries.items():
    print(f"{country}: {int(count)}")

# Statistiken speichern
country_data.to_csv('country_statistics.csv', index=False)


Analysiere Länder und Institutionen...

Top 10 Länder und ihre Publikationszahlen:
USA: 193
China: 140
UK: 48
Canada: 44
Italy: 32
Germany: 29
Australia: 20
Norway: 11
Switzerland: 10
Spain: 9


In [40]:
# 8. Institutionsanalyse
valid_institutions = df['Institution'].dropna().value_counts().head(10)
institution_data = pd.DataFrame({
    'Institution': valid_institutions.index,
    'Count': valid_institutions.values
})

# Institutions-Visualisierung
institution_chart = alt.Chart(institution_data).mark_bar().encode(
    y=alt.Y('Institution:N', 
            sort='-x',
            title='Institution'),
    x=alt.X('Count:Q',
            title='Anzahl Publikationen'),
    tooltip=['Institution:N', 'Count:Q']
).properties(
    title='Top 10 Institutionen in der KI-Software-Engineering-Forschung',
    width=1000,
    height=400
)

# Text-Labels für Institutionen
institution_text = alt.Chart(institution_data).mark_text(
    align='left',
    baseline='middle',
    dx=3
).encode(
    y='Institution:N',
    x='Count:Q',
    text=alt.Text('Count:Q', format='.0f')
)

final_institution_chart = (institution_chart + institution_text).configure_axis(
    grid=True,
    gridOpacity=0.3
)
final_institution_chart.save('institution_analysis.html')

print("\nTop 10 Institutionen und ihre Publikationszahlen:")
for institution, count in valid_institutions.items():
    print(f"{institution}: {int(count)}")

# Statistiken speichern
institution_data.to_csv('institution_statistics.csv', index=False)
print("\nLändersanalyse abgeschlossen und gespeichert")


Top 10 Institutionen und ihre Publikationszahlen:
State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China: 10
Dept. of Computer Science, Iowa State University, Ames, IA, USA: 6
David R. Cheriton School of Computer Science, University of Waterloo, Waterloo, ON, Canada: 4
Simula Research Laboratory, Lysaker, Norway: 4
Singapore Management University, Singapore: 3
Monash University, Melbourne, Australia: 3
Purdue University, West Lafayette, USA: 3
The Chinese University of Hong Kong, Hong Kong, China: 3
Lane Department of Computer Science and Electrical Engineering, West Virginia University, Morgantown, WV, USA: 3
Department of Informatics, University of Oslo, Oslo, Norway: 3

Ländersanalyse abgeschlossen und gespeichert
