https://github.com/explosion/spaCy/blob/9ce059dd067ecc3f097d04023e3cfa0d70d35bb8/spacy/tokens/doc.pyx

https://github.com/explosion/spaCy/blob/f49e2810e6ea5c8b848df5b0f393c27ee31bb7f4/spacy/tokens/span.pyx

In [None]:



# Lemmas of modifier-like words that are counted when trimming a chunk.
_CHUNK_PREDICATES = frozenset({
    "able", "available", "brief", "certain",
    "different", "due", "enough", "especially", "few", "fifth",
    "former", "his", "howbeit", "immediate", "important", "inc",
    "its", "last", "latter", "least", "less", "likely", "little",
    "many", "ml", "more", "most", "much", "my", "necessary",
    "new", "next", "non", "old", "other", "our", "ours", "own",
    "particular", "past", "possible", "present", "proud", "recent",
    "same", "several", "significant", "similar", "some", "such", "sup", "sure",
})

# Dependency labels under which a nominal token is treated as a chunk head.
_CHUNK_DEP_LABELS = (
    "nsubj",
    "dobj",
    "nsubjpass",
    "pcomp",
    "pobj",
    "dative",
    "appos",
    "attr",
    "ROOT",
)

# Coarse part-of-speech tags a token must carry to head a chunk.
_CHUNK_HEAD_POS = frozenset({"NOUN", "PROPN", "PRON"})


def _strip_predicates(span):
    """Drop as many leading tokens from `span` as it contains predicate lemmas.

    span: A sliceable, sized sequence of tokens exposing `lemma_`.
    RETURNS: `span[n:]`, where `n` is the number of tokens whose lemma is in
        `_CHUNK_PREDICATES`; the untrimmed `span` if that slice would be
        empty (e.g. the one-word chunk "others").
    """
    # NOTE(review): matches are counted anywhere in the span but removed from
    # the front, so "the other people" becomes "other people". Confirm that
    # this is intended rather than stripping only leading predicates.
    n_predicates = sum(1 for tok in span if tok.lemma_ in _CHUNK_PREDICATES)
    trimmed = span[n_predicates:]
    return trimmed if len(trimmed) > 0 else span


def custom_chunks(doclike):
    """
    custom extension for noun_chunks incorporating custom attributes

    adapted from noun_chunk extension in spaCy library
    https://github.com/explosion/spaCy/blob/master/spacy/lang/en/syntax_iterators.py

    doclike (Doc or Span): The object to iterate over; only `doclike.doc`
        and iteration over its tokens are required.
    YIELDS: One span per noun chunk, running from the head token's left edge
        to the head token itself, with predicate tokens trimmed by
        `_strip_predicates`. Chunks never overlap.
    """
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Token.dep is a hash, so compare against hashed labels.
    strings = doc.vocab.strings
    np_deps = frozenset(strings.add(label) for label in _CHUNK_DEP_LABELS)
    conj = strings.add("conj")
    prev_end = -1

    for word in doclike:
        if word.pos_ not in _CHUNK_HEAD_POS:
            continue

        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue

        if word.dep in np_deps:
            heads_chunk = True
        elif word.dep == conj:
            # Climb the chain of conjuncts to the first coordinated element.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            heads_chunk = head.dep in np_deps
        else:
            heads_chunk = False

        if not heads_chunk:
            continue

        prev_end = word.i
        # Slicing the Doc builds the span without needing `Span` in scope.
        yield _strip_predicates(doc[word.left_edge.i : word.i + 1])

def merge_named_concepts(doc):
    """Collapse every named-concept span of a parsed Doc into one token.

    doc (Doc): The Doc object; its `doc._.named_concepts` extension supplies
        the spans to merge.
    RETURNS (Doc): The same Doc object, retokenized in place. It is returned
        untouched when `doc.is_parsed` is false.

    Modelled on spaCy's merge_noun_chunks pipeline function:
    https://github.com/explosion/spaCy/blob/master/spacy/pipeline/functions.py
    """
    if doc.is_parsed:
        with doc.retokenize() as retokenizer:
            for concept in doc._.named_concepts:
                root = concept.root
                # Only the root's tag and dep are passed as attrs; the span's
                # custom extension values (CONCEPT, IDEOLOGY, ATTRIBUTE) are
                # not included in the merge attrs.
                retokenizer.merge(concept, attrs={"tag": root.tag, "dep": root.dep})
    return doc