Take input text:
1. Split into sentences

Steps for each sentence:
1. Find entities
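2. Find verbs and expand each one into a verb-phrase span (e.g. "built by")
3. For every span, find the closest entity to its left and to its right
4. Return the relation as a (span, left entity, right entity) triple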



In [23]:
import spacy
import pathlib
import numpy as np
import pandas as pd
from spacy.matcher import Matcher
# Load the `en_core_web_md` spaCy pipeline once; later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_md")


In [24]:
def find_verbs(doc):
    """Return the text of every token in ``doc`` whose coarse POS tag is VERB.

    Args:
        doc: An iterable of tokens exposing ``pos_`` and ``text`` -- in this
            notebook a spaCy ``Doc`` or a sentence ``Span``.

    Returns:
        list[str]: Verb token texts in the order they occur (duplicates kept).
    """
    # Iterate the object that was passed in, not ``doc.doc``: for a sentence
    # Span, ``doc.doc`` is the whole parent document, so every sentence used to
    # report the verbs of all sentences.
    return [token.text for token in doc if token.pos_ == "VERB"]

In [25]:
def longest_span(spans):
    """Pick the longest element of ``spans``, measured with ``len``.

    Returns ``None`` when ``spans`` is empty. If several elements share the
    maximum length, the one that appears first in ``spans`` wins.
    """
    # ``max`` keeps the first of equally long candidates, which is the same
    # element a stable descending sort would place at index 0.
    return max(spans, key=len) if len(spans) > 0 else None

In [26]:
def create_spans(verbs, doc):
    """Expand each verb into the longest verb phrase that starts with it.

    Two token patterns are matched, both anchored on a leading VERB token:
    ``VERB PART* ADV*`` and ``VERB ADP* DET* AUX* ADJ* ADV*``.

    Args:
        verbs: Verb token texts to look up (e.g. the result of ``find_verbs``).
        doc: The spaCy ``Doc`` or sentence ``Span`` to search.

    Returns:
        list: One entry per element of ``verbs``, in the same order. Each entry
        is the text of the longest matched phrase whose first token equals that
        verb, or ``None`` when no matched phrase starts with it.
    """
    patterns = [[{"POS": "VERB"}, {"POS": "PART", "OP": "*"}, {"POS": "ADV", "OP": "*"}],
                [{"POS": "VERB"}, {"POS": "ADP", "OP": "*"}, {"POS": "DET", "OP": "*"},
                 {"POS": "AUX", "OP": "*"},
                 {"POS": "ADJ", "OP": "*"}, {"POS": "ADV", "OP": "*"}]]

    matcher = Matcher(nlp.vocab)
    matcher.add("Fluff", patterns)

    # The matcher reports token offsets relative to the object it was called on,
    # so slice that same object. Slicing ``doc.doc`` instead returned the wrong
    # tokens for any sentence Span that does not start at token 0.
    phrases = []
    for _, start, end in matcher(doc):
        phrase = doc[start:end]
        phrases.append((phrase[0].text, phrase.text))

    # Both patterns begin with the VERB token, so compare the phrase's first
    # token with the verb. A substring test ("use" in "used by") could attach a
    # different verb's phrase.
    return [
        longest_span([text for head, text in phrases if head == verb])
        for verb in verbs
    ]


In [27]:
def create_relation(span, span_index, entities):
    """Pair a verb phrase with the nearest entity on each side of it.

    Args:
        span: Text of the verb phrase.
        span_index: Character offset at which ``span`` starts, expressed in the
            same coordinate system as the entities' ``start_char``/``end_char``.
        entities: Iterable of objects exposing ``start_char`` and ``end_char``
            (spaCy entity spans in this notebook).

    Returns:
        tuple: ``(span, left_ent, right_ent)``. ``left_ent`` is the entity that
        ends closest before the phrase and ``right_ent`` the one that starts
        closest after it; either is ``None`` when no entity lies on that side.
    """
    # Materialise once so a one-shot iterable can be scanned for both sides.
    candidates = list(entities)
    span_end = span_index + len(span)

    # spaCy's ``end_char`` is exclusive, so an entity that directly touches the
    # phrase has ``end_char == span_index`` (or ``start_char == span_end``).
    # The comparisons are inclusive so such neighbours are not skipped.
    before = [ent for ent in candidates if ent.end_char <= span_index]
    after = [ent for ent in candidates if ent.start_char >= span_end]

    # max/min return the first of equal candidates, i.e. ties keep input order.
    left_ent = max(before, key=lambda ent: ent.end_char, default=None)
    right_ent = min(after, key=lambda ent: ent.start_char, default=None)
    return (span, left_ent, right_ent)


In [28]:
def relation_extraction(doc):
    """Extract, print and return the verb-phrase relations found in ``doc``.

    Args:
        doc: A spaCy ``Doc`` or a sentence ``Span``.

    Returns:
        list[tuple]: One ``(span, left_ent, right_ent)`` tuple (as built by
        ``create_relation``) per verb phrase that could be located. The list and
        its length are also printed.
    """
    entities = doc.ents
    verbs = find_verbs(doc)
    verbspans = create_spans(verbs, doc)

    # Entity offsets are measured from the start of the parent document, while
    # ``doc.text.index`` is relative to ``doc``'s own text. A sentence Span
    # exposes its document offset as ``start_char``; fall back to 0 otherwise.
    text_offset = getattr(doc, "start_char", 0)

    relations = []
    for span in verbspans:
        if span is None:
            # create_spans yields None for a verb with no matching phrase;
            # str.index(None) would raise TypeError.
            continue
        # NOTE: index() returns the first occurrence, so a phrase that repeats
        # within the same text always resolves to its first position.
        span_index = text_offset + doc.text.index(span)
        relations.append(create_relation(span, span_index, entities))

    print(relations)
    print(len(relations))
    return relations



In [37]:
def main(input_text="SS Illinois was an iron passenger-cargo steamship built by William Cramp & Sons in 1873."):
    """Run relation extraction on every sentence of ``input_text``.

    Args:
        input_text: Text to analyse; defaults to the notebook's example sentence.
    """
    doc = nlp(input_text)
    for sent in doc.sents:
        # Blank line between the printed results of consecutive sentences.
        print()
        relation_extraction(sent)
main()

/Users/dr/Documents/web_processing/assignment-code/nlp

[('built by', SS Illinois, William Cramp & Sons)]
1


In [50]:
#Test
# Build everything this check needs inside the cell so it runs on a fresh
# kernel: the helpers require the parsed text, and `span` must be taken from
# the phrases they return.
doc = nlp("SS Illinois was an iron passenger-cargo steamship built by William Cramp & Sons in 1873.")
verbs = find_verbs(doc)
verbspans = create_spans(verbs, doc)
span = verbspans[0]
span_index = doc.text.index(span)
span_index

50

In [53]:
# Print the character offset at which each entity of `doc` begins, one per line.
for start_offset in (entity.start_char for entity in doc.ents):
    print(start_offset)

0
59
83


In [58]:
# The example sentence twice, back to back (no space after the first full stop).
# NOTE: the name `input` shadows the builtin; it is kept because the next cell reads it.
input = "SS Illinois was an iron passenger-cargo steamship built by William Cramp & Sons in 1873." * 2

In [59]:
# Run the pipeline over the test string assigned to `input` in the previous cell.
doc = nlp(input)

In [65]:
# `doc.sents` is a one-shot generator; keep a list so later cells can iterate
# or index the sentences more than once.
sents = list(doc.sents)

In [33]:
# Display the raw text of `doc`; the cell that assigns `doc` must have run first,
# otherwise this raises NameError.
doc.text

NameError: name 'doc' is not defined