In [21]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
import time
import json
import os
import re


# Den Pfad zur CSV-Datei definieren (angenommen, sie liegt im 'data'-Ordner deines Projekts)
csv_file_path = os.path.join(os.getcwd(), '..', 'Data', 'test_dataset.csv')

# CSV-Datei einlesen
df = pd.read_csv(csv_file_path)

#-------------------------------------------------------------------------------
# Publikationen der Länder auf einer Geokarte
# Extrahieren der Länder aus den Affiliations und Bereinigung

# Extrahieren der Länder aus den Affiliations und Bereinigung
def clean_country(entry):
    if pd.isna(entry):
        return None
    # Split bei Semikolon, Komma oder Sonderzeichen und nimm den letzten Teil
    entry = re.split(r'[;,\n]', entry)[-1].strip()
    # Entferne zusätzliche Bezeichnungen oder Sonderzeichen
    entry = re.sub(r'[^a-zA-Z\s]', '', entry).strip()
    return entry

df['Country'] = df['Author Affiliations'].apply(clean_country)

# Liste von bekannten "Nicht-Ländern" (Universitäten, Organisationen, etc.)
non_countries = {
    "Microsoft", "Stellar Sciences", "University of Maryland", "University of Minnesota",
    "Carnegie Mellon University", "Editor-in-Chief of IEEE Computer", "Software Productivity Consortium",
    "University of Calgary", "Software Research Associates", "Brainbot Technologies",
    "Royal Military College of Canada", "University of East London", "Institute of Software Technology",
    "Danfoss", "Construx Software", "RWTH Aachen University", "Oakwood Computing Associates",
    "Software Improvement Group", "Software Engineering Management Associates",
    "Blekinge Institute of Technology", "University of Southern California",
    "Ecole Polytechnique de Montréal", "Polytechnique Montréal",
    "LeroThe Irish Software Research Centre", "Computing Trends",
    "ABB", "Vector Consulting Services",
    "AlcatelLucent", "Microsoft Research",
    # Weitere bekannte Organisationen hinzufügen
}

# Entferne offensichtliche Nicht-Länder
df = df[~df['Country'].isin(non_countries)]

# Mapping von Ländernamen in deiner CSV zu GeoJSON-Ländernamen
country_mapping = {
    "USA": "United States of America",
    "United States": "United States of America",
    "UK": "United Kingdom",
    "Great Britain": "United Kingdom",
    "Deutschland": "Germany",
    "S. Korea": "South Korea",
    "PRChina": "China",
    "P.R. China": "China",
    "PR China": "China",
    "Trkiye": "Turkey",
    "Brasil": "Brazil",
    "México": "Mexico",
    "Mxico": "Mexico",
    "xico": "Mexico",
    "Italia": "Italy",
    "España": "Spain",
    "UAE": "United Arab Emirates",
    "Netherlands": "Netherlands",
    "South Korea": "South Korea",
    "Korea": "South Korea",
    "Republic of Korea": "South Korea",
    "Russia": "Russia",
    "Vereinigte Staaten": "United States of America",
    "Turkey": "Turkey",
    "Austria": "Austria",
    "Ireland": "Ireland",
    "Denmark": "Denmark",
    "Sweden": "Sweden",
    "Finland": "Finland",
    "Norway": "Norway",
    "Portugal": "Portugal",
    "Japan": "Japan",
    "Taiwan": "Taiwan",
    "India": "India",
    "Pakistan": "Pakistan",
    "Bangladesh": "Bangladesh",
    "South Africa": "South Africa",
    "Argentina": "Argentina",
    "Uruguay": "Uruguay",
    "Chile": "Chile",
    "Colombia": "Colombia",
    "Brazil": "Brazil",
    "Italy": "Italy",
    "Germany": "Germany",
    "Spain": "Spain",
    "France": "France",
    "United Kingdom": "United Kingdom",
    "United States of America": "United States of America",
    "The Netherlands": "Netherlands",
    "Macau": "China",
    "CA": "Canada",
    "NL": "Netherlands",
    "CN": "China",
    "JP": "Japan",
    "TW": "Taiwan",
    "IN": "India",
    "BR": "Brazil",
    "DE": "Germany",
    "CH": "Switzerland",
    "KSA": "Saudi Arabia",
    # Weitere Zuordnungen hinzufügen
}

# Mapping anwenden
df['Country'] = df['Country'].replace(country_mapping)

# Aggregation der Publikationen pro Land
country_counts = df['Country'].value_counts().reset_index()
country_counts.columns = ['Country', 'Publication Count']

# Entferne leere oder ungültige Einträge
country_counts = country_counts[country_counts['Country'].notna()]

# Lade GeoJSON-Daten für Ländergrenzen
geojson_url = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json'
geojson_data = json.loads(pd.read_json(geojson_url).to_json())

# Debugging: Nicht gemappte Länder prüfen
geojson_countries = [feature['properties']['name'] for feature in geojson_data['features'] if 'properties' in feature and 'name' in feature['properties']]
unmatched_countries = set(country_counts['Country']) - set(geojson_countries)
if unmatched_countries:
    print(f"Folgende Länder konnten nicht gemappt werden: {unmatched_countries}")

# Erstelle die Karte
m = folium.Map(location=[20, 0], zoom_start=2)

# Choroplethenkarte hinzufügen
folium.Choropleth(
    geo_data=geojson_data,
    data=country_counts,
    columns=['Country', 'Publication Count'],
    key_on='feature.properties.name',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Publikationen pro Land',
).add_to(m)

# Karte anzeigen
m

# # Extrahieren der Länder und Universitäten aus den Affiliations
# df['Country'] = df['Author Affiliations'].dropna().apply(lambda x: x.split(',')[-1].strip())
# df['University'] = df['Author Affiliations'].dropna().apply(lambda x: x.split(',')[0].strip())

# # Aggregation der Publikationen pro Land und Universität
# country_counts = df['Country'].value_counts()
# university_counts = df.groupby(['University', 'Country']).size().reset_index(name='Publication Count')

# # Verwende vorbereitete Koordinaten (Beispiel für Länder)
# coordinates_country = {
#     'USA': (37.0902, -95.7129),
#     'Germany': (51.1657, 10.4515),
#     'India': (20.5937, 78.9629),
#     'China': (35.8617, 104.1954),
#     'France': (46.6034, 1.8883),
# }

# # Lade GeoJSON-Daten für Ländergrenzen
# geojson_url = 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries.geo.json'
# geojson_data = json.loads(pd.read_json(geojson_url).to_json())

# # Erstelle eine Geo-Karte mit folium
# m = folium.Map(location=[20, 0], zoom_start=2)  # Weltkarte initialisieren

# # Länder einfärben, die Publikationen haben
# if isinstance(geojson_data, dict) and 'features' in geojson_data:
#     for feature in geojson_data['features']:
#         if 'properties' in feature and 'name' in feature['properties']:
#             country_name = feature['properties']['name']
#             count = country_counts.get(country_name, 0)
#             if count > 0:
#                 folium.GeoJson(
#                     feature,
#                     style_function=lambda x, count=count: {
#                         'fillColor': 'green',
#                         'color': 'black',
#                         'weight': 1,
#                         'fillOpacity': 0.5 if count > 0 else 0
#                     }
#                 ).add_to(m)

# # Hinzufügen von Markern für Universitäten
# marker_cluster = MarkerCluster().add_to(m)
# for _, row in university_counts.iterrows():
#     university = row['University']
#     country = row['Country']
#     count = row['Publication Count']
#     if country in coordinates_country:
#         lat, lon = coordinates_country[country]
#         folium.CircleMarker(
#             location=[lat, lon],
#             radius=5 + (count / university_counts['Publication Count'].max() * 15),  # Punktgröße proportional
#             popup=f"{university}, {country}: {count} Publikationen",
#             color="blue",
#             fill=True,
#             fill_opacity=0.6,
#         ).add_to(marker_cluster)

# # Hinzufügen einer Legende für die Top 10 Länder
# top_10_countries = country_counts.head(10)
# legend_html = f"""
# <div style="position: fixed;
#             bottom: 50px; left: 50px; width: 300px; height: 200px;
#             background-color: white; z-index:9999; font-size:14px;
#             border:2px solid grey; padding: 10px; opacity: 0.8;">
#     <h4 style="margin:0; text-align:center;">Top 10 Länder</h4>
#     <ul style="list-style: none; padding: 0;">
# """
# for i, (country, count) in enumerate(top_10_countries.items(), 1):
#     legend_html += f"<li>{i}. {country}: {count} Publikationen</li>"
# legend_html += "</ul></div>"

# m.get_root().html.add_child(folium.Element(legend_html))

# # Karte direkt in Jupyter Notebook anzeigen
# m

Folgende Länder konnten nicht gemappt werden: {'Romania', 'China', 'Chile', 'Iran', 'Maryland', 'Fairfield University', 'GB', 'South Korea', 'Vancouver', 'Pakistan', 'QC', 'Qatar', 'Montana Tech', 'Irvine', 'Brazil', 'Jordan', 'The University of Mancheste', 'Federal University of Minas Gerais', 'American University of Kuwait', 'Austria', 'Jordan university of science and technology', 'Pontifical Catholic University', 'University of LAquilas', 'cole Polytechnique de Montral', 'Estonia', 'University of Hawaii at Manoa', 'Mexico', 'University of Maryland Baltimore County', 'Israel', 'Carnegie Mellon Software Engineering Institute', 'California', 'Uruguay', 'Colorado Springs', 'Ulm University', 'Dhaka Bangladesh', 'Bulgaria', 'Bavaria', 'Indonesia', 'the Netherlands', 'Keene', 'Kuwait', 'Oman', 'Germany', 'OH', 'GA', 'Bandung', 'Denmark', 'United Arab Emirates', 'javanet', 'S Korea', 'Thailand', 'Spain', 'PK', 'France', 'Ecuador', 'Johannesburg', 'MS group', 'NY', 'United Kingdom', 'India'

AttributeError: 'NoneType' object has no attribute 'get'