In [1]:
import requests
import networkx as nx
import os
import json

In [23]:
def get_subclasses_of(wikidata_id):
    """Fetch the IDs of every transitive subclass of a Wikidata class.

    Sends a SPARQL query to the public Wikidata query service that follows
    ``P279`` edges zero or more times starting from ``wikidata_id``.

    Args:
        wikidata_id: Entity ID of the root class, e.g. ``"Q5"``.

    Returns:
        A set of lower-cased entity IDs, taken from the last path segment of
        each result URI.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON (the
            caller in this notebook treats that as a timed-out query).
    """
    sparql_template = '''
    SELECT DISTINCT ?item WHERE {{
      ?item wdt:P279* wd:{} .
    }}
    '''
    response = requests.get(
        'https://query.wikidata.org/sparql',
        params={'format': 'json', 'query': sparql_template.format(wikidata_id)},
    )
    bindings = response.json()["results"]["bindings"]

    subclass_ids = set()
    for binding in bindings:
        # Results are full entity URIs; only the trailing ID is kept.
        entity_uri = binding["item"]["value"].lower()
        subclass_ids.add(entity_uri.rsplit("/", 1)[-1])
    return subclass_ids

In [3]:
def load_wikidata_id_to_aliases(path):
    """Read a tab-separated alias file into an ``id -> {aliases}`` mapping.

    Each non-blank line is expected to look like ``<id>\\t<alias>\\t<alias>...``.
    Everything is lower-cased before it is stored.

    Args:
        path: Path to the UTF-8 encoded alias file.

    Returns:
        dict mapping each lower-cased ID to the set of its lower-cased aliases.
        An ID that appears on several lines gets the union of all its aliases.
    """
    id_to_aliases = {}
    # Explicit UTF-8: aliases contain non-ASCII text and the platform default
    # encoding is not guaranteed to handle it.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate lazily instead of readlines() so the whole dump is never
        # held in memory twice.
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line would otherwise register an empty-string ID.
                continue

            wikidata_id, *aliases = line.split("\t")
            # setdefault keeps aliases gathered from an earlier line with the
            # same ID instead of resetting them.
            id_to_aliases.setdefault(wikidata_id, set()).update(aliases)

    return id_to_aliases

In [4]:
def load_wikidata_graph(path, relations=("p31", "p279")):
    """Build a directed graph from a tab-separated triplet file.

    Each non-blank line must be ``<src>\\t<rel>\\t<dst>``. Lines are lower-cased
    and only triplets whose relation is listed in ``relations`` become edges
    ``src -> dst``; the relation is stored in the edge attribute ``rel``.

    Args:
        path: Path to the triplet file.
        relations: Relation IDs to keep (compared case-insensitively).
            Defaults to ``p31`` and ``p279``, the two relations this notebook
            has always loaded.

    Returns:
        networkx.DiGraph containing the selected edges and their endpoints.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    # Lines are lower-cased below, so normalise the filter the same way.
    wanted_relations = {relation.lower() for relation in relations}

    graph = nx.DiGraph()
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file; the triplet dump is too large to justify readlines().
        for line_number, line in enumerate(f, start=1):
            line = line.lower().strip()
            if not line:
                # Tolerate blank / trailing lines instead of failing to unpack.
                continue

            fields = line.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    f"{path}:{line_number}: expected 3 tab-separated fields, got {len(fields)}"
                )

            src, rel, dst = fields
            if rel in wanted_relations:
                # add_edge creates missing endpoint nodes itself.
                graph.add_edge(src, dst, rel=rel)

    return graph

In [5]:
def save_gazetteers_from_wikidata(output_dir, wikidata_ids, wikidata_id_to_aliases, wikidata_graph):
    """Write one gazetteer file per label from Wikidata class hierarchies.

    For every ``(wiki_id, name)`` root of a label, the subclasses of the root
    are fetched with ``get_subclasses_of``. Each subclass present in
    ``wikidata_graph`` is expanded to every node that can reach it along graph
    edges, and one line ``<alias>\\t<label>\\t<name>`` is written per alias of
    each such node. A node is written at most once per root.

    Args:
        output_dir: Directory for the ``eng-wikidata-<label>.txt`` files;
            created if it does not exist.
        wikidata_ids: Mapping ``label -> [(wiki_id, name), ...]``.
        wikidata_id_to_aliases: Mapping ``node id -> iterable of aliases``.
        wikidata_graph: Directed graph whose edges point from an entity to its
            class / superclass.

    Returns:
        None. Roots whose subclass query fails to decode are reported and
        skipped.
    """
    if output_dir:
        # Opening a file inside a missing directory would fail.
        os.makedirs(output_dir, exist_ok=True)

    for label, wiki_ids_and_names in wikidata_ids.items():

        output_file_path = os.path.join(output_dir, f"eng-wikidata-{label}.txt")
        # Explicit UTF-8 so non-ASCII aliases do not depend on the locale.
        with open(output_file_path, "w", encoding="utf-8") as out_file:

            for wiki_id, name in wiki_ids_and_names:
                print(f"Writing {label} ({wiki_id}, {name}) to {output_file_path}")

                try:
                    subclass_ids = get_subclasses_of(wiki_id)
                except json.JSONDecodeError:
                    print(f"Request for {label} ({wiki_id}, {name}) timed out.")
                    continue

                added_nodes = set()
                for subclass_id in subclass_ids:
                    if subclass_id not in wikidata_graph:
                        continue

                    if subclass_id in added_nodes:
                        # Already reached from an earlier subclass, so
                        # everything upstream of it has been written too;
                        # a second traversal would only repeat that work.
                        continue

                    # reverse=True walks edges backwards, i.e. towards the
                    # entities that point at this class.
                    edges = nx.bfs_edges(wikidata_graph, subclass_id, reverse=True)
                    nodes = {subclass_id} | {found for _, found in edges}

                    for node in nodes - added_nodes:
                        for alias in wikidata_id_to_aliases.get(node, ()):
                            out_file.write(f"{alias}\t{label}\t{name}\n")

                    added_nodes |= nodes

In [18]:
def save_gazetteers_from_collection(output_dir, entity_types, gazetteers_dir):
    """Merge per-entity-type gazetteer files into one file per label.

    For every label, the files ``eng-<type>-name-wd.txt`` and
    ``eng-<type>-alias-wd.txt`` of each of its entity types are read from
    ``gazetteers_dir``. Every non-blank line is lower-cased and written to
    ``eng-hltcoe-<label>.txt`` as ``<line>\\t<label>\\t<type>``.

    Args:
        output_dir: Directory for the merged files; created if missing.
        entity_types: Mapping ``label -> [entity_type, ...]``.
        gazetteers_dir: Directory holding the source gazetteer files.

    Returns:
        None.

    Raises:
        FileNotFoundError: If an expected source file does not exist.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for label, label_entity_types in entity_types.items():
        if not label_entity_types:
            # Nothing to merge; do not leave an empty file behind.
            continue

        output_file_path = os.path.join(output_dir, f"eng-hltcoe-{label}.txt")
        # Open the output once per label. Reopening it with "w" for every
        # entity type truncates the file each time, so only the entries of the
        # last entity type would remain.
        with open(output_file_path, "w", encoding="utf-8") as out_file:

            for entity_type in label_entity_types:
                assert entity_type, f"empty entity type configured for label {label!r}"

                for filename in (f"eng-{entity_type}-name-wd.txt", f"eng-{entity_type}-alias-wd.txt"):

                    with open(os.path.join(gazetteers_dir, filename), encoding="utf-8") as in_file:

                        for line in in_file:
                            line = line.lower().strip()

                            if not line:
                                continue

                            out_file.write(f"{line}\t{label}\t{entity_type}\n")

In [10]:
# Input dumps. The defaults are the original author's local download
# locations; set the environment variables below to run the notebook on
# another machine without editing this cell.
ENTITY_ALIASES_FILE = os.environ.get(
    "WIKIDATA5M_ENTITY_ALIASES_FILE",
    "/home/christoph/Downloads/wikidata5m_alias/wikidata5m_entity.txt",
)
WIKIDATA_GRAPH_TRIPLETS_FILE = os.environ.get(
    "WIKIDATA5M_TRIPLETS_FILE",
    "/home/christoph/Downloads/wikidata5m_all_triplet.txt",
)

# Generated gazetteer files are written here (relative to the notebook).
GAZETTEER_OUTPUT_DIR = "../data/gazetteers/"

In [8]:
# Load both inputs once; the Wikidata gazetteer export below reuses them.
wikidata_id_to_aliases = load_wikidata_id_to_aliases(path=ENTITY_ALIASES_FILE)
wikidata_graph = load_wikidata_graph(path=WIKIDATA_GRAPH_TRIPLETS_FILE)

In [25]:
# Root classes per gazetteer label, given as (Wikidata ID, readable name).
# Each root is expanded to its subclasses and the graph nodes that lead to them.
#
# Currently switched off:
#   PROD - Q2424752 "product", Q2897903 "goods and services" (both time out)
#   CW   - Q7366 "song", Q11424 "film", Q571 "book"
wikidata_ids = {
    "CW": [("Q17537576", "creative works")],
    "GRP": [("Q16334295", "group of humans")],
    "CORP": [("Q167037", "corporation"), ("Q4830453", "business")],
    "PER": [("Q5", "human")],
    "LOC": [
        ("Q105810946", "physical location"),
        ("Q13226383", "facility"),
        ("Q1048835", "pol terit. entity"),
        ("Q2221906", "geo location"),
    ],
}


save_gazetteers_from_wikidata(
    GAZETTEER_OUTPUT_DIR,
    wikidata_ids,
    wikidata_id_to_aliases,
    wikidata_graph,
)

Writing CW (Q7366, song) to ../data/gazetteers/eng-wikidata-CW.txt
Writing CW (Q11424, film) to ../data/gazetteers/eng-wikidata-CW.txt
Writing CW (Q571, book) to ../data/gazetteers/eng-wikidata-CW.txt


In [19]:
# Source entity types (as used in the input gazetteer file names) that are
# merged into each output label.
entity_types = {
    "LOC": [
        "LOC",
        "GPE",
        "FAC",
        "GOVT",
        "AIR",
    ],
    "PROD": [
        "COMP",
        "VEH",
        "CHEM",
    ],
    "GRP": [
        "ORG",
        "POL",
    ],
    "PER": [
        "PER",
    ],
    "CORP": [
        "COMM",
    ],
}


save_gazetteers_from_collection(
    output_dir=GAZETTEER_OUTPUT_DIR,
    entity_types=entity_types,
    # Default is the original author's download location; override with the
    # HLTCOE_GAZETTEERS_DIR environment variable on other machines.
    gazetteers_dir=os.environ.get(
        "HLTCOE_GAZETTEERS_DIR", "/home/christoph/Downloads/gazetteers/"
    ),
)