In [1]:
# Notwendige Installationen (nur einmal ausführen)
!pip install selenium
!pip install undetected-chromedriver
!pip install webdriver-manager
!pip install openpyxl



In [2]:
# Start a Chrome instance through undetected-chromedriver with headless=True.
# NOTE(review): no later cell visible here references this module-level `driver`
# (extract_metrics_with_selenium builds its own browser via create_driver()),
# and nothing visible calls driver.quit() on it — confirm whether this cell is
# still needed or whether the browser process is being leaked.
import undetected_chromedriver as uc
driver = uc.Chrome(headless=True, use_subprocess=False)

In [3]:
# Importe
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Configuration: pause between requests and the input/output workbooks
delay = 5  # seconds to wait between two requests
ziel_datei = "tandfonline_journals_final.xlsx"  # workbook the results are written to
eingabe_datei = "tandfonline_journals.xlsx"  # workbook listing the journals to scrape

# Load the input workbook into a DataFrame
df = pd.read_excel(eingabe_datei)

In [5]:
# Build a Chrome WebDriver for scraping
def create_driver(headless=False):
    """Create a Selenium Chrome driver with a fixed 1920x1080 window.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``.

    Args:
        headless: When True, pass ``--headless=new`` so Chrome runs without a
            visible window. Defaults to False, which is what this function
            has always done: earlier comments called the browser "headless",
            but no headless flag was ever set.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    if headless:
        options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

In [6]:
# Extract the metrics shown on a journal page
def _parse_metric_items(items, fallback_key):
    """Convert the ``<li>`` entries of one metrics section into ``{label: value}``.

    For an entry with a ``<strong>`` tag, the tag's text is the value and the
    remaining text of the entry is the label. An entry without ``<strong>`` is
    stored as a whole under ``fallback_key`` (a later such entry overwrites an
    earlier one).
    """
    parsed = {}
    for li in items:
        strong_tag = li.find("strong")
        if strong_tag is None:
            parsed[fallback_key] = li.get_text(strip=True)
            continue

        value = strong_tag.get_text(strip=True)
        # Build the label only from text nodes outside the <strong> tag.
        # Removing `value` from the full text with str.replace() would also
        # delete every other occurrence of that string inside the label.
        label_parts = []
        for text_node in li.strings:
            if any(parent is strong_tag for parent in text_node.parents):
                continue
            piece = text_node.strip()
            if piece:
                label_parts.append(piece)
        parsed["".join(label_parts).strip()] = value
    return parsed


def extract_metrics_with_selenium(url, load_wait=5):
    """Load a journal page in Chrome and read the metrics in its ``journalMetrics`` div.

    Args:
        url: Address of the page to open. Anything that is not a non-empty
            string is rejected before a browser is started.
        load_wait: Seconds to sleep after ``driver.get`` so the page can
            finish rendering. Defaults to 5.

    Returns:
        A dict mapping metric labels to their values, collected from the
        ``usage`` (first ``<li>`` only), ``citation-metrics`` and ``speed``
        sections. If the URL is invalid, the container is missing or an
        exception occurs while scraping, a dict with the single key
        ``"Fehler"`` describing the problem is returned instead.
    """
    # Reject missing cells (None / NaN) up front instead of launching Chrome for them.
    if not isinstance(url, str) or not url.strip():
        return {"Fehler": f"Ungültige URL: {url!r}"}

    driver = create_driver()
    try:
        driver.get(url)
        time.sleep(load_wait)  # give the page time to load completely

        soup = BeautifulSoup(driver.page_source, "html.parser")
        container = soup.find("div", class_="journalMetrics")
        if not container:
            return {"Fehler": "journalMetrics-Container nicht gefunden"}

        metrics = {}

        # Usage metric: only the first list entry of the section is read.
        usage_div = container.find("div", class_="usage")
        if usage_div:
            first_item = usage_div.find("li")
            if first_item:
                metrics.update(_parse_metric_items([first_item], "Usage"))

        # Citation and speed/acceptance metrics: every list entry is read.
        for css_class, fallback_key in (
            ("citation-metrics", "Citation"),
            ("speed", "Speed/Acceptance"),
        ):
            section = container.find("div", class_=css_class)
            if section:
                metrics.update(_parse_metric_items(section.find_all("li"), fallback_key))

        return metrics

    except Exception as e:
        return {"Fehler": str(e)}

    finally:
        # Always close the browser, including on the early returns above.
        driver.quit()

In [7]:
# Visit every URL of the input table and collect one metrics dict per row
alle_metrics = []

for i, row in df.iterrows():
    url = row.get("URL")
    print(f"[{i+1}/{len(df)}] Scraping: {url}")
    # Store the result directly; failures arrive as a dict as well.
    alle_metrics.append(extract_metrics_with_selenium(url))
    print(f"Fertig mit URL {i+1}. Warte {delay} Sekunden...\n")
    # Pause before the next request.
    time.sleep(delay)

[1/6] Scraping: https://www.tandfonline.com/journals/rdsp20/about-this-journal#journal-metrics
Fertig mit URL 1. Warte 5 Sekunden...

[2/6] Scraping: https://www.tandfonline.com/journals/ceps20/about-this-journal#journal-metrics
Fertig mit URL 2. Warte 5 Sekunden...

[3/6] Scraping: https://www.tandfonline.com/journals/cjoe20/about-this-journal#journal-metrics
Fertig mit URL 3. Warte 5 Sekunden...

[4/6] Scraping: https://www.tandfonline.com/journals/nens20/about-this-journal#journal-metrics
Fertig mit URL 4. Warte 5 Sekunden...

[5/6] Scraping: https://www.tandfonline.com/journals/rptp20/about-this-journal#journal-metrics
Fertig mit URL 5. Warte 5 Sekunden...

[6/6] Scraping: https://www.tandfonline.com/journals/tsus20/about-this-journal#journal-metrics
Fertig mit URL 6. Warte 5 Sekunden...



In [8]:
# Put the collected metrics next to the input columns and write the workbook
metrics_df = pd.DataFrame(alle_metrics)
df_out = pd.concat([df, metrics_df], axis="columns")
df_out.to_excel(ziel_datei, index=False)

print(f"Fertig! Datei gespeichert unter: {ziel_datei}")

Fertig! Datei gespeichert unter: tandfonline_journals_final.xlsx
