# Custom Noun Chunking
-----

There is a problem whereby spaCy's inbuilt noun_chunks is too greedy for the chunking required for detecting the ingroups and outgroups.

There are several examples in the test ingroup and outgroup sentences where named entities are chunked with other nouns when they would preferably be kept separate.

There are also several examples where a noun chunk contains more than one noun with a custom attribute; the chunk therefore needs to be resolved to a single instance.

This notebook adapts spaCy's noun_chunks source code for the specific purpose of this pipeline.

Source code at these links:

    https://github.com/explosion/spaCy/blob/9ce059dd067ecc3f097d04023e3cfa0d70d35bb8/spacy/tokens/doc.pyx

    https://github.com/explosion/spaCy/blob/f49e2810e6ea5c8b848df5b0f393c27ee31bb7f4/spacy/tokens/span.pyx


## Test data

The following sentences will be used for this notebook.

In [11]:
## create a dict object of all the ingroup/outgroup sentences
import os
import cndutils as ut
path = r"C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset"

# names of the entries in the dataset folder whose extension is exactly ".jsonl"
jsonl_files = list(
    filter(lambda f: os.path.splitext(f)[1] == ".jsonl", os.listdir(path))
)

# flatten every value of every loaded record into one dict keyed 0, 1, 2, ...
sent_dict = {}
for file in jsonl_files:
    records = ut.load_jsonl(os.path.join(path, file))
    for entry in records:
        # continue the numbering from the current size of sent_dict
        sent_dict.update(enumerate(entry.values(), start=len(sent_dict)))

Loaded 49 records from C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset\binladen_ingroup_sents.jsonl
Loaded 101 records from C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset\binladen_outgroup_sents.jsonl
Loaded 66 records from C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset\bush_ingroup_sents.jsonl
Loaded 37 records from C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset\bush_outgroup_sents.jsonl


In [12]:
%%time

import spacy
import pipeline
# build the project pipeline object; its `nlp` attribute is used for parsing below
cnd = pipeline.CND()

# create the "merge_noun_chunks" component and add it to the pipeline
# (presumably cnd.nlp is a spaCy Language object -- confirm in pipeline.CND)
merge_nps = cnd.nlp.create_pipe("merge_noun_chunks")
cnd.nlp.add_pipe(merge_nps)

Wall time: 21.2 s


In [76]:
# show the "name" entry of the loaded pipeline's meta data
print(cnd.nlp.meta['name'])

core_web_md


In [73]:
import pandas as pd
import os
import jsonlines
from spacy import displacy
from visuals import sent_frame
from IPython.display import clear_output

def sent_select(nlp, input_dict, parse = False, path = None, file = None):

    """
    Interactively step through input_dict and collect the chosen sentences.

    Each sentence is parsed with nlp, displayed, and the user is prompted:
    - y : add the sentence to the output and move on
    - d : delete the last output entry (only when it is the sentence shown
          immediately before the current one) and step back to it
    - b : step back one sentence (only when that sentence was not added)
    - q : stop
    - anything else : skip the sentence and move on

    input:
    - nlp = callable that turns a text into a parsed doc
    - input_dict = {int : "str"}, keyed 0 .. len(input_dict) - 1
    - parse = if truthy, also render the dependency parse with displacy
    - path, file = if path is given and file is a string, the selection is
      written to <path>/<file>.jsonl (rewritten in full on every step and
      once more when the loop ends)

    output:
    - output_list = [{0 : "str"} ... {n : "str"}]
    """

    output_list = []
    index = 0
    jsonl_file_type = ".jsonl"
    file_jsonl = None

    # if a path and a string are passed then create a target filename
    if path is not None and isinstance(file, str):
        file_jsonl = ''.join((file, jsonl_file_type))

    def _save():
        # rewrite the target file with the current selection; no-op without a target
        if file_jsonl is not None:
            with jsonlines.open(os.path.join(path, file_jsonl), 'w') as writer:
                writer.write_all(output_list)

    # iterate over input_dict until completion
    while index < len(input_dict):

        # checkpoint the selection so far
        _save()

        # test for whether the latest entry in output_list is the sentence shown
        # just before the current one; index > 0 keeps the lookup off key -1
        test = (
            index > 0
            and len(output_list) > 0
            and list(output_list[-1].values())[0] == input_dict[index - 1]
        )

        # clear screen
        clear_output(wait=True)

        # show progress through input_dict
        print(f'{index} / {len(input_dict)}')

        # get and parse text
        text = input_dict[index]
        doc = nlp(text)

        # if the option to show the dependency parse is passed display it
        if parse:
            displacy.render(doc, style="dep")

        # display the sentence frame in compact form
        display(sent_frame(doc))

        # get choice
        choice = input("add to test_sents (y), delete previous (d), quit (q), back (b)").lower()

        # if the choice is y then add sentence text to output_list and continue iteration
        if choice == "y":
            output_list.append({len(output_list) : text})
            index += 1

        # if the choice is d and the last entries for input_dict and output_list are matching
        # then delete the last item from output_list and go back by one step, otherwise ignore
        elif choice == "d":
            if test:
                print(f'removing: {output_list.pop()}')
                index -= 1
                # pause so the message is read before the screen is cleared
                input()

        # if the choice is b and the last entries for input_dict and output_list are not matching
        # then go back one step; never step back past the first sentence
        elif choice == "b":
            if not test and index > 0:
                index -= 1

        # if the choice is q then quit
        elif choice == "q":
            break

        # if choice is none of the above then continue iteration
        else:
            index += 1

    # persist the final state: the in-loop checkpoint runs before the choice is
    # made, so without this the last choice would never reach the file
    _save()

    if file_jsonl is not None:
        print(f'written {len(output_list)} entries to file: {file_jsonl}')
        print(path)
    print('complete')
    return output_list
            
# start from an empty list so test_sents is defined even if the interactive
# selection below is interrupted before it returns
test_sents = []
# run the interactive selection over every loaded sentence, showing the
# dependency parse and writing the result to "test_sents.jsonl" under `path`
test_sents = sent_select(cnd.nlp, sent_dict, parse = True, path = path, file = "test_sents")
        

2 / 253


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
text,And,as,far,as,the foreign policy,was,concerned,",",the report,exposed,not,only,how,this policy,has,disregarded,the Islamic issues,and,ignored,the Muslims,",",but,also,how,help,and,support,were,provided,to,the enemy,against,the Muslims,;,the cases,of,Gaza/Ariha,and,the Communists,in,the south,of,Yemen,are,still,fresh,in,the memory,",",and,more,can,be,said,.
ent_type,,,,,,,,,,,,,,,,,,,,NORP,,,,,,,,,,,,,NORP,,,,,,NORP,,,,GPE,,,,,,,,,,,,
concept,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
attribute,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
ideology,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


add to test_sents (y), delete previous (d), quit (q), back (b) q


written 1 entries to file: test_sents.jsonl
C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset
complete


In [None]:



# Lemmas that custom_chunks counts when deciding how many tokens to trim from
# the front of a chunk. A frozenset gives constant-time membership tests and is
# built once instead of on every call.
_CHUNK_PREDICATES = frozenset([
    'able', 'available', 'brief', 'certain',
    'different', 'due', 'enough', 'especially', 'few', 'fifth',
    'former', 'his', 'howbeit', 'immediate', 'important', 'inc',
    'its', 'last', 'latter', 'least', 'less', 'likely', 'little',
    'many', 'ml', 'more', 'most', 'much', 'my', 'necessary',
    'new', 'next', 'non', 'old', 'other', 'our', 'ours', 'own',
    'particular', 'past', 'possible', 'present', 'proud', 'recent',
    'same', 'several', 'significant', 'similar', 'some', 'such', 'sup', 'sure',
])


def _trim_predicates(span):
    """Return span with one leading token dropped per predicate-lemma token it contains."""
    # NOTE(review): the count covers the whole span but the trim is taken from
    # the left, so a predicate that is not at the front (eg "the other people")
    # removes a different token -- confirm this is the intended behaviour.
    counter = sum(1 for tok in span if tok.lemma_ in _CHUNK_PREDICATES)

    # keep spans made only of predicate lemmas whole, eg the noun_chunk 'others'
    # would otherwise become a zero length span
    if counter >= len(span):
        counter = 0

    return span[counter:]


def custom_chunks(doclike):
    """
    custom extension for noun_chunks incorporating custom attributes

    Iterates over the tokens of doclike and, for each NOUN / PROPN / PRON token
    whose dependency label is one of the noun-phrase labels below (or which is a
    conjunct chained to such a token), yields the span running from the token's
    left edge up to and including the token, after predicate trimming.

    doclike: a Doc or Span (anything with a .doc attribute that iterates tokens)
    YIELDS: Span objects over doclike.doc; chunks never overlap

    adapted from noun_chunk extension in spaCy library
    https://github.com/explosion/spaCy/blob/master/spacy/lang/en/syntax_iterators.py
    """
    # imported locally so the function does not depend on Span already being
    # present in the surrounding namespace
    from spacy.tokens import Span

    labels = [
        "nsubj",
        "dobj",
        "nsubjpass",
        "pcomp",
        "pobj",
        "dative",
        "appos",
        "attr",
        "ROOT",
    ]
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # dependency labels are compared by their string-store ids
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")
    prev_end = -1

    for word in doclike:

        if word.pos_ not in ("NOUN", "PROPN", "PRON"):
            continue

        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue

        if word.dep in np_deps:
            prev_end = word.i
            yield _trim_predicates(Span(doc, word.left_edge.i, word.i + 1))

        elif word.dep == conj:
            # walk up the chain of conjuncts to the first conjoined head
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep in np_deps:
                prev_end = word.i
                yield _trim_predicates(Span(doc, word.left_edge.i, word.i + 1))

def merge_named_concepts(doc):
    """Collapse every named-concept span of a parsed Doc into one token.

    doc (Doc): The Doc object; its spans are read from doc._.named_concepts.
    RETURNS (Doc): The same Doc object, with the spans merged when
        doc.is_parsed is set and untouched otherwise.

    Each merged token is given the tag and dep of the span's root token.

    Adapts the spacy merge noun chunks function
    code: https://github.com/explosion/spaCy/blob/master/spacy/pipeline/functions.py
    """
    if doc.is_parsed:
        with doc.retokenize() as retokenizer:
            for concept in doc._.named_concepts:
                root = concept.root
                retokenizer.merge(
                    concept,
                    attrs={"tag": root.tag, "dep": root.dep},
                )
    # draft of an extra attrs entry for the custom extensions, not currently used:
    # "_" : {"CONCEPT" : span._.CONCEPT,
    #                         "IDEOLOGY" : span._.IDEOLOGY,
    #                         "ATTRIBUTE" : span._.ATTRIBUTE},

    return doc