In [2]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
import time
import json
import os
import re


# Den Pfad zur CSV-Datei definieren (angenommen, sie liegt im 'data'-Ordner deines Projekts)
csv_file_path = os.path.join(os.getcwd(), '..', 'Data', 'Dataset.csv')

# CSV-Datei einlesen
df = pd.read_csv(csv_file_path)

#-------------------------------------------------------------------------------
# Publikationen der Länder auf einer Geokarte
# Extrahieren der Länder aus den Affiliations und Bereinigung

# Extrahieren der Länder aus den Affiliations und Bereinigung
def clean_country(entry):
    if pd.isna(entry):
        return None
    # Split bei Semikolon, Komma oder Sonderzeichen und nimm den letzten Teil
    entry = re.split(r'[;,\n]', entry)[-1].strip()
    # Entferne zusätzliche Bezeichnungen oder Sonderzeichen
    entry = re.sub(r'[^a-zA-Z\s]', '', entry).strip()
    return entry

df['Country'] = df['Author Affiliations'].apply(clean_country)

# Liste von bekannten "Nicht-Ländern" (Universitäten, Organisationen, etc.)
non_countries = {
    "Microsoft", "Stellar Sciences", "University of Maryland", "University of Minnesota",
    "Carnegie Mellon University", "Editor-in-Chief of IEEE Computer", "Software Productivity Consortium",
    "University of Calgary", "Software Research Associates", "Brainbot Technologies",
    "Royal Military College of Canada", "University of East London", "Institute of Software Technology",
    "Danfoss", "Construx Software", "RWTH Aachen University", "Oakwood Computing Associates",
    "Software Improvement Group", "Software Engineering Management Associates",
    "Blekinge Institute of Technology", "University of Southern California",
    "Ecole Polytechnique de Montréal", "Polytechnique Montréal",
    "LeroThe Irish Software Research Centre", "Computing Trends",
    "ABB", "Vector Consulting Services",
    "AlcatelLucent", "Microsoft Research",
    # Weitere bekannte Organisationen hinzufügen
}

# Entferne offensichtliche Nicht-Länder
df = df[~df['Country'].isin(non_countries)]

# Mapping von Ländernamen in deiner CSV zu GeoJSON-Ländernamen
country_mapping = {
    "USA": "United States of America",
    "United States": "United States of America",
    "UK": "United Kingdom",
    "Great Britain": "United Kingdom",
    "Deutschland": "Germany",
    "S. Korea": "South Korea",
    "PRChina": "China",
    "P.R. China": "China",
    "PR China": "China",
    "Trkiye": "Turkey",
    "Brasil": "Brazil",
    "México": "Mexico",
    "Mxico": "Mexico",
    "xico": "Mexico",
    "Italia": "Italy",
    "España": "Spain",
    "UAE": "United Arab Emirates",
    "Netherlands": "Netherlands",
    "South Korea": "South Korea",
    "Korea": "South Korea",
    "Republic of Korea": "South Korea",
    "Russia": "Russia",
    "Vereinigte Staaten": "United States of America",
    "Turkey": "Turkey",
    "Austria": "Austria",
    "Ireland": "Ireland",
    "Denmark": "Denmark",
    "Sweden": "Sweden",
    "Finland": "Finland",
    "Norway": "Norway",
    "Portugal": "Portugal",
    "Japan": "Japan",
    "Taiwan": "Taiwan",
    "India": "India",
    "Pakistan": "Pakistan",
    "Bangladesh": "Bangladesh",
    "South Africa": "South Africa",
    "Argentina": "Argentina",
    "Uruguay": "Uruguay",
    "Chile": "Chile",
    "Colombia": "Colombia",
    "Brazil": "Brazil",
    "Italy": "Italy",
    "Germany": "Germany",
    "Spain": "Spain",
    "France": "France",
    "United Kingdom": "United Kingdom",
    "United States of America": "United States of America",
    "The Netherlands": "Netherlands",
    "Macau": "China",
    "CA": "Canada",
    "NL": "Netherlands",
    "CN": "China",
    "JP": "Japan",
    "TW": "Taiwan",
    "IN": "India",
    "BR": "Brazil",
    "DE": "Germany",
    "CH": "Switzerland",
    "KSA": "Saudi Arabia",
    # Weitere Zuordnungen hinzufügen
}

# Mapping anwenden
df['Country'] = df['Country'].replace(country_mapping)

# Aggregation der Publikationen pro Land
country_counts = df['Country'].value_counts().reset_index()
country_counts.columns = ['Country', 'Publication Count']

# Entferne leere oder ungültige Einträge
country_counts = country_counts[country_counts['Country'].notna()]

# Lade GeoJSON-Daten für Ländergrenzen
geojson_url = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json'
geojson_data = json.loads(pd.read_json(geojson_url).to_json())

# Debugging: Nicht gemappte Länder prüfen
geojson_countries = [feature['properties']['name'] for feature in geojson_data['features'] if 'properties' in feature and 'name' in feature['properties']]
unmatched_countries = set(country_counts['Country']) - set(geojson_countries)
if unmatched_countries:
    print(f"Folgende Länder konnten nicht gemappt werden: {unmatched_countries}")

# Erstelle die Karte
m = folium.Map(location=[20, 0], zoom_start=2)

# Choroplethenkarte hinzufügen
folium.Choropleth(
    geo_data=geojson_data,
    data=country_counts,
    columns=['Country', 'Publication Count'],
    key_on='feature.properties.name',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Publikationen pro Land',
).add_to(m)

# Karte anzeigen
m

Folgende Länder konnten nicht gemappt werden: {'', 'Kuwait', 'Germany', 'KY', 'Leipzig', 'Oregon', 'Italy', 'Monash University', 'Santa Barbara', 'East China Normal University', 'Valencia', 'Czech Republic', 'School of Electrical Engineering and Computer Science at Oregon State University', 'Atlanta', 'Beijing', 'Belgium', 'University of Chicago', 'Cambridge', 'France', 'Spain', 'Vietnam', 'Riverside', 'The Ohio State University', 'UCAS', 'Qatar', 'Slovenia', 'Chile', 'Texas', 'University Ave', 'Raleigh', 'Zagreb', 'Nanjing', 'NE', 'UrbanaChampaign', 'WA', 'Taiwan', 'Australia', 'City University of Hong Kong', 'Norway', 'Shandong University', 'MI', 'Korea University', 'George Mason University', 'Montbonnot SaintMartin', 'The Chinese University of Hong Kong', 'China', 'NJ', 'MA', 'The University of Texas at Dallas', 'Queens University', 'AL', 'NC', 'Estonia', 'Johannes Kepler University Linz', 'Finland', 'School of Cyber Security', 'National University of Defense Technology', 'Author Af

AttributeError: 'NoneType' object has no attribute 'get'