Take input text:
1. Split into sentences

Steps for each sentence:
1. Find entities
2. Find verbs and create spans
3. For every span, find the closest entity to the left and right
4. Return relation



In [1]:
import spacy
import pathlib
import numpy as np
import pandas as pd
from spacy.matcher import Matcher
#nlp = spacy.load("en_core_web_md")
nlp = spacy.load("en_coreference_web_trf")



In [2]:
def find_verbs(doc):
    matcher = Matcher(nlp.vocab)
    pattern = [[{"POS":"VERB"}]]
    matcher.add("Verbs",pattern)
    matches = matcher(doc.doc)
    verbs = []
    for _,start,end in matches:
        verbs.append(doc.doc[start:end].text)
    return verbs

In [3]:
def longest_span(spans):
    if (len(spans) == 0):
        return None
    sorted_spans = sorted(spans, key=lambda s: len(s), reverse=True)
    return sorted_spans[0]

In [4]:
def create_spans(verbs, doc):
    patterns = [[{"POS": "VERB"}, {"POS": "PART", "OP": "*"}, {"POS": "ADV", "OP": "*"}],
                [{"POS": "VERB"}, {"POS": "ADP", "OP": "*"}, {"POS": "DET", "OP": "*"},
                 {"POS": "AUX", "OP": "*"},
                 {"POS": "ADJ", "OP": "*"}, {"POS": "ADV", "OP": "*"}]]


    matcher = Matcher(nlp.vocab)
    matcher.add("Fluff", patterns)
    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:

        spans.append(doc.doc[start:end].text)
    res = []
    for verb in verbs:
        verbspans = [span for span in spans if verb in span]
        span = longest_span(verbspans)
        res.append(span)

    return res


In [5]:
def create_relation(span,span_index, entities):

    #Find left
    left_ent = None
    for ent in entities:
        if ent.end_char < span_index:
            if left_ent is None or left_ent.end_char < ent.end_char:
                left_ent = ent
    #Right ent
    right_ent = None
    for ent in entities:
        if ent.start_char > (span_index + len(span)):
            if right_ent is None or right_ent.start_char > ent.start_char:
                right_ent = ent
    relation = (span, left_ent, right_ent)
    return relation


In [6]:
def relation_extraction(doc):
    entities = doc.ents
    print(entities)
    verbs = find_verbs(doc)
    verbspans = create_spans(verbs, doc)
    relations = []
    for span in verbspans:
        span_index = doc.text.index(span)
        relation = create_relation(span,span_index,entities)

        relations.append(relation)

    print(relations)
    print(len(relations))



In [8]:
def main():
    path = pathlib.Path().resolve()



    input_text = """The Yellow Palace (Danish: Det Gule Palæ), or Bergum's Mansion, is an 18th-century town mansion situated at Amaliegade 18, next to Amalienborg Palace, in the Frederiksstaden district of Copenhagen, Denmark. It is considered the first example of Neoclassical architecture in Copenhagen. Originally built as a burgher's home, the mansion was acquired by the Danish Royal Family. Prince Christian of Glücksborg, later to become Christian IX of Denmark, took up residence there, and it became the birthplace of his children Frederick VIII of Denmark, Alexandra, Queen of the United Kingdom, George I of Greece and Maria Feodorovna, Empress of Russia. Today the building is owned by the Danish Palaces and Properties Agency and houses the Lord Chamberlain's Office."""
    clusters = nlp(input_text)
    print(clusters.spans)
    doc = nlp(input_text)
    relation_extraction(doc)
main()

{'coref_clusters_1': [The Yellow Palace (Danish: Det Gule Palæ), or Bergum's Mansion,, It, the mansion, it, the building], 'coref_clusters_2': [Copenhagen, Copenhagen], 'coref_clusters_3': [Denmark, Denmark, Denmark], 'coref_clusters_4': [Prince Christian of Glücksborg, later to become Christian IX of Denmark, his]}
()


ValueError: [E155] The pipeline needs to include a morphologizer or tagger+attribute_ruler in order to use Matcher or PhraseMatcher with the attribute POS. Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` instead of `list(nlp.tokenizer.pipe())`.

In [8]:
#Test
verbs = find_verbs()
verbspans = create_spans(verbs)
span_index = doc.text.index(span)
span_index

TypeError: find_verbs() missing 1 required positional argument: 'doc'

In [53]:
for ent in doc.ents:
    print(ent.start_char)

0
59
83


In [58]:
input = "SS Illinois was an iron passenger-cargo steamship built by William Cramp & Sons in 1873.SS Illinois was an iron passenger-cargo steamship built by William Cramp & Sons in 1873."

In [59]:
doc = nlp(input)

In [65]:
sents = doc.sents

In [33]:
doc.text

NameError: name 'doc' is not defined

In [53]:
input_text = "The Yellow Palace (Danish: Det Gule Palæ), or Bergum's Mansion, is an 18th-century town mansion situated at Amaliegade 18, next to Amalienborg Palace, in the Frederiksstaden district of Copenhagen, Denmark."

doc = nlp(input_text)
ents = doc.ents


In [54]:
for ent in ents:
    print(ent.text,ent.label_)

The Yellow Palace ORG
Danish NORP
Det Gule Palæ PERSON
Bergum's Mansion ORG
18th-century DATE
Amaliegade 18 FAC
Amalienborg Palace ORG
Frederiksstaden PERSON
Copenhagen GPE
Denmark GPE


In [55]:
nlp.get_pipe("ner").labels


('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [59]:
spacy.explain("ORDINAL")

'"first", "second", etc.'

In [60]:
doc.text

"The Yellow Palace (Danish: Det Gule Palæ), or Bergum's Mansion, is an 18th-century town mansion situated at Amaliegade 18, next to Amalienborg Palace, in the Frederiksstaden district of Copenhagen, Denmark."

In [9]:
import neuralcoref
neuralcoref.add_to_pipe(nlp)

ValueError: spacy.strings.StringStore size changed, may indicate binary incompatibility. Expected 112 from C header, got 64 from PyObject

In [15]:
nlp = spacy.load("en_coreference_web_trf")

ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer