In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import networkx as nx

from pytorch_ie.data.datasets.multiconer import load_multiconer

In [3]:
# Directory handed to load_multiconer as data_dir.
# NOTE(review): this and the two Wikidata paths below are absolute paths into a
# single user's home directory; the notebook cannot run elsewhere unchanged.
MULTI_CONER_DIR = "/home/christoph/Downloads/training_data/"

# Tab-separated alias file consumed by load_wikidata_entity_aliases.
ENTITY_ALIASES_FILE = "/home/christoph/Downloads/wikidata5m_alias/wikidata5m_entity.txt"
# Tab-separated triplet file consumed by load_wikidata_graph.
WIKIDATA_TRIPLETS_FILE = "/home/christoph/Downloads/wikidata5m_all_triplet.txt"

# One-id-per-line files consumed by load_ids_from_txt_file, relative to the
# notebook's working directory.
# NOTE(review): the CW constant points at "..._cv.txt" while every other file
# name matches its constant — confirm "cv" is the real file name and not a typo.
ALL_SUBCLASSES_CW_FILE = "./all_subclasses_cv.txt"
ALL_SUBCLASSES_GRP_FILE = "./all_subclasses_grp.txt"
ALL_SUBCLASSES_ORG_FILE = "./all_subclasses_org.txt"
ALL_SUBCLASSES_CORP_FILE = "./all_subclasses_corp.txt"
ALL_SUBCLASSES_BUSINESS_FILE = "./all_subclasses_business.txt"

In [10]:
def load_wikidata_entity_aliases(path):
    """Build a lookup from alias text to the Wikidata ids that carry that alias.

    Every line of the file is lower-cased, stripped and split on tabs. The
    first field is taken as the entity id and each remaining field as one
    alias of that entity, so both the keys and the ids in the result are
    lower case.

    Args:
        path: Location of the tab-separated alias file (read as UTF-8).

    Returns:
        dict mapping each alias (str) to the set of ids (str) listing it.
        Lines without any alias field contribute nothing.
    """
    aliases = {}
    # Decode explicitly: aliases can contain accented characters, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle lazily instead of readlines(), which would
        # materialise every line of the file in memory first.
        for line in f:
            wikidata_id, *names = line.lower().strip().split("\t")
            for name in names:
                aliases.setdefault(name, set()).add(wikidata_id)
    return aliases

In [11]:
def load_wikidata_graph(path):
    """Load the P31 triplets of a Wikidata triplet file into a directed graph.

    Each non-blank line must consist of three tab-separated fields
    ``src``, ``rel``, ``dst``. Lines are lower-cased before parsing, so the
    relation is matched against ``"p31"`` and all node ids are lower case.
    Only P31 triplets are kept (P279 triplets are currently not loaded); each
    one becomes an edge ``src -> dst`` carrying the edge attribute ``rel``.

    Args:
        path: Location of the tab-separated triplet file (read as UTF-8).

    Returns:
        networkx.DiGraph with one edge per P31 triplet.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            tab-separated fields.
    """
    G = nx.DiGraph()
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file line by line; readlines() would first load every
        # line of the triplet file into memory.
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise fail the unpack below.
                continue
            src, rel, dst = line.split("\t")
            if rel == "p31":
                # add_edge creates both endpoints when they are missing, so
                # separate add_node calls are unnecessary.
                G.add_edge(src, dst, rel=rel)

    return G

In [16]:
def load_ids_from_txt_file(path):
    """Return the lines of a text file as a list of whitespace-stripped strings.

    One entry is produced per line, in file order; a blank line produces an
    empty string.
    """
    with open(path, "r") as handle:
        return [raw_line.strip() for raw_line in handle]

In [23]:
# Alias -> set of Wikidata ids; the loader lower-cases every line, so keys and
# ids are lower case.
entity_to_id = load_wikidata_entity_aliases(ENTITY_ALIASES_FILE)

In [25]:
# Directed graph holding only the "p31" edges of the triplet file (src -> dst).
wikidata_graph = load_wikidata_graph(WIKIDATA_TRIPLETS_FILE)

In [26]:
# Id lists per category, one id per line of the corresponding file.
# NOTE(review): these are plain lists, so each `in` test against them scans the
# whole list; wrapping them in set(...) would make the lookups constant-time.
subclasses_cw = load_ids_from_txt_file(ALL_SUBCLASSES_CW_FILE)
subclasses_grp = load_ids_from_txt_file(ALL_SUBCLASSES_GRP_FILE)
subclasses_org = load_ids_from_txt_file(ALL_SUBCLASSES_ORG_FILE)
subclasses_corp = load_ids_from_txt_file(ALL_SUBCLASSES_CORP_FILE)
subclasses_business = load_ids_from_txt_file(ALL_SUBCLASSES_BUSINESS_FILE)

In [4]:
# Later cells read `train_docs`, which no cell defined; without it the notebook
# fails with NameError on a fresh kernel. Load the training split here.
# NOTE(review): "train" is assumed to be the split name, by analogy with
# "validation" below — confirm against load_multiconer.
train_docs = load_multiconer(
    data_dir=MULTI_CONER_DIR,
    name="en",
    split="train",
)

# English validation split from the same data directory.
val_docs = load_multiconer(
    data_dir=MULTI_CONER_DIR,
    name="en",
    split="validation",
)

Using custom data configuration en-6574638273c3bd25
Reusing dataset multi_co_ner (/home/christoph/.cache/huggingface/datasets/multi_co_ner/en-6574638273c3bd25/1.0.0/3964f6ebcc5b795608a3f4d09c084107dad977c625a68c59b657c65dd5312647)


In [13]:
# Number of documents in train_docs.
len(train_docs)

15300

In [27]:
# For the first 50 training documents, print each annotated entity next to the
# gazetteer labels derived from Wikidata:
#   "CW"   - some id of the entity, or one of its P31 targets, is in subclasses_cw
#   "CORP" - ... is in subclasses_corp or subclasses_business
#   "GRP"  - ... is in subclasses_grp (only checked when CORP did not match)
for doc in train_docs[:50]:
    print(doc.text)
    print("*" * 25)
    for entity in doc.annotations("entities"):
        entity_text = doc.text[entity.start : entity.end]

        # Alias keys are lower-cased by load_wikidata_entity_aliases, so the
        # lookup key is lower-cased too; unknown aliases give an empty set.
        wd_ids = entity_to_id.get(entity_text.lower(), set())

        # For every id present in the graph: its direct successors plus the id
        # itself. Built once per id and shared by both checks below; the GRP
        # check previously reused whatever `successors` the CW check (or an
        # earlier entity) had left behind.
        candidates_per_id = [
            list(wikidata_graph.successors(wd_id)) + [wd_id]
            for wd_id in wd_ids
            if wd_id in wikidata_graph
        ]

        gazetteer = []

        # At most one "CW" label, as soon as any id matches.
        if any(k in subclasses_cw for candidates in candidates_per_id for k in candidates):
            gazetteer.append("CW")

        # At most one of "CORP"/"GRP": the first id that matches either wins,
        # with CORP taking precedence over GRP for that id.
        for candidates in candidates_per_id:
            if any(k in subclasses_corp or k in subclasses_business for k in candidates):
                gazetteer.append("CORP")
                break
            if any(k in subclasses_grp for k in candidates):
                gazetteer.append("GRP")
                break

        print(entity_text, entity.label, "-->", gazetteer)
    print("=" * 100)
    print()

his playlist includes sonny sharrock , gza , country teasers and the notorious b.i.g.
*************************
the notorious b.i.g. PER --> []
country teasers GRP --> []
gza PER --> []
sonny sharrock PER --> []

it is a series of badminton tournaments , sanctioned by badminton world federation ( bwf ) since 2007 .
*************************
badminton world federation GRP --> []

all songs written by m.o.d. , unless otherwise stated
*************************
m.o.d. GRP --> ['GRP']

he worked in a bookstore before becoming a journalist , first for le devoir , and then for cité libre , for which he later became the director .
*************************
le devoir GRP --> ['CW']
bookstore GRP --> []
cité libre CW --> ['CW']

kingdom hospital , lewiston from stephen king miniseries of the same name
*************************
lewiston LOC --> []
kingdom hospital CW --> ['CW']
stephen king PER --> []

rinder did not speak on camera because he promised his first interview to the bbc .
***********

In [79]:
# Spot check: direct successors of one id in the graph.
list(wikidata_graph.successors("q126638"))

['q1346006']

In [96]:
# Spot check: adjacency view (outgoing edges) of one id in the graph.
wikidata_graph["q35127"]

AtlasView({})

In [94]:
# Spot check: ids registered for one lower-cased alias.
entity_to_id["rotten tomatoes"]

{'q105584'}

In [6]:
from pytorch_ie.models.transformer_span_classification_entity import WikiDataGraphGazetteer

ImportError: cannot import name 'WikiDataGraphGazetteer' from 'pytorch_ie.models.transformer_span_classification_entity' (/home/christoph/Projects/research/pytorch-ie/pytorch_ie/models/transformer_span_classification_entity.py)

In [2]:
#g = Gazetteer(path="/home/christoph/Downloads/gazetteers")

In [26]:
# Gazetteer built from the subclass files in the working directory plus the
# Wikidata triplet and alias files configured above.
g = WikiDataGraphGazetteer(
    subclasses_path="./",
    wikidata_graph_path=WIKIDATA_TRIPLETS_FILE,
    wikidata_entity_aliases_path=ENTITY_ALIASES_FILE,
)

In [27]:
# Print, for the first 50 training documents, every annotated entity with its
# gold label and the result of g.lookup for the entity's surface text.
for doc in train_docs[:50]:
    print(doc.text)
    print("*" * 25)
    for entity in doc.annotations("entities"):
        entity_text = doc.text[entity.start : entity.end]
        print(entity_text, entity.label, "-->", g.lookup(entity_text))
    print("=" * 100)
    print()

his playlist includes sonny sharrock , gza , country teasers and the notorious b.i.g.
*************************
the notorious b.i.g. PER --> []
sonny sharrock PER --> []
gza PER --> []
country teasers GRP --> []

it is a series of badminton tournaments , sanctioned by badminton world federation ( bwf ) since 2007 .
*************************
badminton world federation GRP --> []

all songs written by m.o.d. , unless otherwise stated
*************************
m.o.d. GRP --> ['GRP']

he worked in a bookstore before becoming a journalist , first for le devoir , and then for cité libre , for which he later became the director .
*************************
le devoir GRP --> ['CW']
bookstore GRP --> []
cité libre CW --> ['CW']

kingdom hospital , lewiston from stephen king miniseries of the same name
*************************
stephen king PER --> []
kingdom hospital CW --> ['CW']
lewiston LOC --> []

rinder did not speak on camera because he promised his first interview to the bbc .
***********

In [18]:
# Displays the full id list loaded from ALL_SUBCLASSES_CW_FILE.
# NOTE(review): this dumps every entry into the notebook output; a slice such
# as subclasses_cw[:10] together with len(subclasses_cw) would be easier to read.
subclasses_cw

['q5803607',
 'q100969249',
 'q682626',
 'q11496089',
 'q7511905',
 'q2113720',
 'q55659881',
 'q7315445',
 'q3847997',
 'q3042412',
 'q861911',
 'q9051280',
 'q712274',
 'q104146008',
 'q15841241',
 'q40800432',
 'q54502633',
 'q368281',
 'q2868579',
 'q16525012',
 'q107215170',
 'q75122566',
 'q1062722',
 'q13136212',
 'q110110631',
 'q2304946',
 'q75113060',
 'q6906613',
 'q51879670',
 'q74154043',
 'q11387074',
 'q96252740',
 'q33120867',
 'q1264695',
 'q107423527',
 'q9384895',
 'q3297601',
 'q898583',
 'q97359653',
 'q100973394',
 'q29382606',
 'q67035425',
 'q400185',
 'q12299961',
 'q2328600',
 'q58663',
 'q32861734',
 'q25590632',
 'q1862315',
 'q201884',
 'q85299672',
 'q16739336',
 'q92162007',
 'q3487109',
 'q69909183',
 'q11424431',
 'q948970',
 'q1287199',
 'q110135832',
 'q2423654',
 'q4155843',
 'q1271310',
 'q1207416',
 'q99772086',
 'q12987256',
 'q388197',
 'q28840786',
 'q1762817',
 'q20588567',
 'q100974012',
 'q1545589',
 'q766217',
 'q40248002',
 'q107294498',
 '