In [16]:
import csv, time, os
import pandas as pd
from typing import Optional, Dict
from functools import lru_cache  # CACHE

In [17]:
from find_qid import find_qid_by_orcid
from find_qid import _api_get

In [18]:
"""
Searches for the Wikidata QID of a given label (name), optionally language-specific.
"""

@lru_cache(maxsize=None)  # memoize results so a repeated name costs no second API request
def find_qid_by_name(name: str, lang: str = "en") -> Optional[str]:
    """Search Wikidata for an item matching ``name`` and return its QID.

    Sends one ``wbsearchentities`` request (type ``item``, limit 1) through
    ``_api_get`` and returns the ``id`` of the first search hit.

    Args:
        name: Label to search for. Surrounding whitespace is ignored.
        lang: Language code sent as the ``language`` parameter of the search.

    Returns:
        The QID of the first hit (e.g. ``"Q123456"``), or ``None`` when
        ``name`` is empty/blank, the search has no hits, or the response does
        not have the expected ``{"search": [{"id": ...}]}`` structure.

    Note:
        ``lru_cache`` also remembers ``None`` results for the lifetime of the
        kernel; call ``find_qid_by_name.cache_clear()`` to force fresh lookups.
    """
    # Normalize first: a whitespace-only name must not be sent to the API
    query = name.strip() if isinstance(name, str) else name
    if not query:
        return None

    # Send API request to wbsearchentities endpoint (Wikidata search)
    data = _api_get(
        {
            "action": "wbsearchentities",
            "search": query,
            "language": lang,
            "type": "item",
            "limit": 1,
            "format": "json",
        }
    )
    try:
        # Return the Q-ID of the first result (e.g. "Q123456")
        return data["search"][0]["id"]
    except (KeyError, IndexError, TypeError):
        # KeyError/IndexError: no result or incomplete response.
        # TypeError: response (or a nested part of it) is None / not subscriptable.
        return None

In [19]:
"""
Searches for a Wikidata QID for an institution by its label.
Results are cached locally.
"""

# Simple cache: institution label → QID (or None if not found)
inst_cache: Dict[str, Optional[str]] = {}


def find_qid_by_institution_label(label: str) -> Optional[str]:  # API
    """Look up the Wikidata QID of an institution by its label.

    The label is searched with ``wbsearchentities`` (type ``item``, limit 1),
    first with language ``"en"`` and then with ``"de"``. The first hit wins.
    Results are stored in the module-level ``inst_cache`` so each distinct
    label is only looked up once.

    Args:
        label: Institution label. Surrounding whitespace is ignored.

    Returns:
        The QID of the first hit, or ``None`` when ``label`` is empty/blank or
        neither language produced a hit.
    """
    # Normalize so that "  Foo " and "Foo" share one cache entry
    label = label.strip() if isinstance(label, str) else label

    # Abort if no input
    if not label:
        return None

    # Return from cache if already present (a cached None means "known to be missing")
    if label in inst_cache:
        return inst_cache[label]

    every_response_usable = True

    # Search Wikidata by label – first in English, then in German
    for lang in ("en", "de"):
        data = _api_get({
            "action": "wbsearchentities", "search": label, "language": lang,
            "type": "item", "limit": 1, "format": "json"})

        if not isinstance(data, dict):
            # Unusable response (e.g. None): try the next language, but do not
            # remember the label as "not found" afterwards.
            every_response_usable = False
            continue

        try:
            qid = data["search"][0]["id"]
        except (KeyError, IndexError, TypeError):
            # No "search" key, empty hit list, or a hit without "id" → no match here
            continue

        inst_cache[label] = qid

        # Optional info output if German label was used
        if lang == "de":
            print(f"[info] Institution '{label}' found via German label → {qid}")
        return qid

    # No match in either language → cache with None (only if both answers were usable)
    if every_response_usable:
        inst_cache[label] = None
    return None

In [41]:
def _cell_text(value) -> str:
    """Return a table cell as stripped text; missing values (NaN/None) become ''."""
    return "" if pd.isna(value) else str(value).strip()


def file_to_qs(infile: str, outfile: str, pause: float = 0.1) -> None:
    """Turn a table of people into a QuickStatements CSV for not-yet-existing items.

    For every distinct (name, ORCID) row the function checks via
    ``find_qid_by_orcid`` / ``find_qid_by_name`` whether a QID is already
    found, and skips the row if so. Otherwise the institution is resolved with
    ``find_qid_by_institution_label`` and a ``CREATE`` row is collected.

    Args:
        infile: Input table. ``.xlsx``/``.xls`` is read with ``pd.read_excel``,
            anything else with ``pd.read_csv``. Must contain the columns
            ``Name``, ``Institution``, ``ORCID`` and ``ORCID-Link``.
        outfile: Target CSV with the columns
            ``qid, Len, P31, P496, S854, P108``. It is only written when at
            least one new row was produced.
        pause: Seconds to wait after the lookups of each row, to avoid
            overloading the API.

    Raises:
        ValueError: If a required column is missing from the input table.
    """
    # Determine file extension (xls/xlsx or csv)
    ext = os.path.splitext(infile)[1].lower()

    # Read input file depending on format
    df = pd.read_excel(infile) if ext in {".xlsx", ".xls"} else pd.read_csv(infile)

    # Check if all required columns are present
    required = {"Name", "Institution", "ORCID", "ORCID-Link"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {', '.join(sorted(missing))}")

    # Initialize result list and deduplication tracker
    rows = []
    processed = set()

    # Iterate through all rows of input file
    for _, r in df.iterrows():
        # Missing cells become "" instead of the literal text "nan"
        # (str(NaN) == "nan" would otherwise be searched on Wikidata).
        name = _cell_text(r["Name"])
        if not name:
            print("[warn] Row without a name ⇒ skipped")
            continue
        orcid = _cell_text(r["ORCID"])

        # Deduplicate by name + ORCID (name lowercased)
        key = (name.lower(), orcid)
        if key in processed:
            continue
        processed.add(key)

        # Prepare institution and URL
        inst_label = _cell_text(r["Institution"])
        url = _cell_text(r["ORCID-Link"])

        # Check if person already exists (via ORCID or name);
        # the ORCID lookup is skipped when the row has no ORCID.
        qid = (find_qid_by_orcid(orcid) if orcid else None) or find_qid_by_name(name)

        # The institution is only needed for people that do not exist yet
        inst_qid = None if qid else find_qid_by_institution_label(inst_label)

        # Short pause to avoid overloading the API. It must happen before the
        # `continue` branches below, otherwise skipped rows are never throttled.
        time.sleep(pause)

        if qid:
            print(f"[skip] {name} already exists as {qid}")
            continue

        if not inst_qid:
            print(f"[warn] Institution '{inst_label}' not found ⇒ skipped")
            continue

        # Build QuickStatements row
        rows.append({
            "qid": "CREATE",
            "Len": name,
            "P31": "Q5",          # instance of → human
            "P496": orcid,        # ORCID
            "S854": url,          # source (URL)
            "P108": inst_qid,     # employer/affiliation
        })

    # If no new rows → skip export (an existing outfile is left untouched)
    if not rows:
        print("No new items – nothing exported.")
        return

    # Write QuickStatements file in CSV format
    field_order = ["qid", "Len", "P31", "P496", "S854", "P108"]
    with open(outfile, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=field_order)
        writer.writeheader()
        writer.writerows(rows)

    # Success message with row count
    print(f"✓ {len(rows)} QuickStatements rows → {outfile}")


In [44]:
# Input: table of people with their institution and ORCID information
csv_input_path = "../outputs/input_with_orcid.csv"

# Output: QuickStatements CSV produced from that table
csv_output_path = "../outputs/qs_main_items.csv"

# Run the export: people with an existing QID are skipped, the rest become QS rows
file_to_qs(infile=csv_input_path, outfile=csv_output_path)

In [45]:
# Repeated run of the export with the same input and output paths as the cell before
file_to_qs(infile=csv_input_path, outfile=csv_output_path)

[skip] Alexander Sczyrba already exists as Q30420936
[skip] Jens Stoye already exists as Q89498719
[skip] Michael Beckstette already exists as Q114411617
[skip] Liren Huang already exists as Q114780829
[skip] Sebastian Jünemann already exists as Q56948964
[skip] Kassian Kobert already exists as Q133094637
[skip] Anandhi Iyappan already exists as Q59196905
[skip] Peer Bork already exists as Q7160367
[skip] Sarah Schulz already exists as Q65162179
[skip] Daniel Podlesny already exists as Q133331882
[skip] Manja Marz already exists as Q87730329
[skip] Winfried Göttsch already exists as Q44200631
[skip] Anderson Santos already exists as Q39510481
[skip] Ulisses Nunes da Rocha already exists as Q47007256
[skip] Martin Bole already exists as Q102304978
[skip] Adrian Fritz already exists as Q133333363
[skip] Alice McHardy already exists as Q2646932
[skip] Mattea Müller already exists as Q56957915
[skip] Fernando Meyer already exists as … (output truncated)

In [46]:
# Reload the QuickStatements file that was just generated
df = pd.read_csv(csv_output_path)

# Keep only the ORCID values (column P496) as a one-column frame named "orcid"
# and store them separately for further processing
orcid_column = df["P496"].to_frame(name="orcid")
orcid_column.to_csv("../outputs/orcid_only.csv", index=False)
print("✓ ORCID list was exported")

✓ ORCID list was exported


In [25]:
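# Test call (commented out). Note: fetch_orcid_sections is neither defined nor
# imported in the cells shown here – import it before re-enabling these lines.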
# orcid = "0000-0002-1481-2996"
# data = fetch_orcid_sections(orcid)
# print(data)