# Merging Compounds
---

spaCy's `merge_noun_chunks` function is too coarse-grained, since it merges the descriptive tokens of a noun as well as the constituent tokens of the noun itself. Merging only the tokens linked by the `compound` dependency is a more fine-grained way to separate the noun from its descriptive elements.

The purpose of this notebook is to create a pipeline component for merging compound words.


## Import and set up the pipeline

In [1]:
%%time
import os
import pandas as pd
import spacy
from cndlib.visuals import display_side_by_side

# Load the medium English model as the base pipeline.
nlp = spacy.load("en_core_web_md")

# Build the "merge_entities" component and slot it in right after "ner".
merge_ents = nlp.create_pipe("merge_entities")
nlp.add_pipe(merge_ents, after="ner")

# Report the resulting component order on two lines: heading, then names.
print("Pipeline Components", " | ".join(nlp.pipe_names), sep="\n")



Pipeline Components
tagger | parser | ner | merge_entities
Wall time: 5.32 s


## Get test data

In [2]:
%%time
# One (initially empty) record per orator; later cells add further keys.
docs = {orator: {} for orator in ("bush", "binladen")}

bush_dir = u"C:\\Users\\spa1e17\\OneDrive - University of Southampton\\Hostile-Narrative-Analysis\\dataset\\George Bush"
bush_filename = "20010920-Address to Joint Session of Congress Following 911 Attacks.txt"

binladen_dir = u"C:\\Users\\spa1e17\\OneDrive - University of Southampton\\Hostile-Narrative-Analysis\\dataset\\Osama bin Laden"
binladen_filename = "19960823-Declaration of Jihad Against the Americans Occupying the Land of the Two Holiest Sites.txt"

# Read each speech in full and store it under docs[<orator>]["text"].
# NOTE(review): no encoding is passed to open(), so the platform default
# is used -- confirm this matches how the text files were saved.
for orator, text_path in (
    ("bush", os.path.join(bush_dir, bush_filename)),
    ("binladen", os.path.join(binladen_dir, binladen_filename)),
):
    with open(text_path, 'r') as fp:
        docs[orator]["text"] = fp.read()

Wall time: 2 ms


## Get Compound Phrases From the Text

In [3]:
%%time

def get_compound(chunk):
    """
    Find the first compound modifier among the left children of a chunk's root.

    input: a noun chunk; only `chunk.root.lefts` is read
    output: the first left child whose `dep_` is "compound", or None when
            the root has no such child
    """
    return next(
        (child for child in chunk.root.lefts if child.dep_ == "compound"),
        None,
    )
        
    
# Column order of the per-orator table; passing it to the DataFrame
# constructor keeps the columns present even when no chunk qualifies.
columns = ["Chunk Text", "Root Word", "Leftmost Compound", "New Chunk Text"]

for orator in docs:
    doc = nlp(docs[orator]["text"])

    # Walk the noun chunks once and reuse the list for both the total
    # count and the table rows.
    noun_chunks = list(doc.noun_chunks)
    docs[orator]["Total Noun Chunks"] = len(noun_chunks)

    rows = []
    for chunk in noun_chunks:
        # Look the compound up a single time per chunk.
        compound = get_compound(chunk)
        if compound is None:
            continue
        rows.append({"Chunk Text": chunk.text,
                     "Root Word": str(chunk.root),
                     # stored as the token object itself, not its text
                     "Leftmost Compound": compound,
                     # chunk text trimmed to start at the compound token
                     "New Chunk Text": doc[compound.i : chunk.end].text})

    docs[orator]["Compound Chunks"] = pd.DataFrame(rows, columns=columns)

captions = [f"{orator} yields {len(docs[orator]['Compound Chunks'])} Compound Chunks out of {docs[orator]['Total Noun Chunks']} Total Noun Chunks"
            for orator in docs]

display_side_by_side([docs[orator]["Compound Chunks"] for orator in docs], captions)

Unnamed: 0,Chunk Text,Root Word,Leftmost Compound,New Chunk Text
0,Mr. President Pro Tempore,Pro Tempore,President,President Pro Tempore
1,our National Anthem,Anthem,National,National Anthem
2,the British Prime Minister,Minister,Prime,Prime Minister
3,surprise attacks,attacks,surprise,surprise attacks
4,a fringe form,form,fringe,fringe form
5,a fringe movement,movement,fringe,fringe movement
6,the Taliban regime,regime,Taliban,Taliban regime
7,the Taliban regime,regime,Taliban,Taliban regime
8,the Taliban regime,regime,Taliban,Taliban regime
9,United States authorities,authorities,United States,United States authorities

Unnamed: 0,Chunk Text,Root Word,Leftmost Compound,New Chunk Text
0,O ye,ye,O,O ye
1,O mankind,mankind,O,O mankind
2,your Guardian-Lord,Lord,Guardian,Guardian-Lord
3,O ye,ye,O,O ye
4,[Koranic verses,verses,Koranic,Koranic verses
5,[Koranic verses,verses,Koranic,Koranic verses
6,the Jewish-crusade alliance aggression,aggression,alliance,alliance aggression
7,the unjust crusade campaign,campaign,crusade,crusade campaign
8,al-'Izz Ibn-'Abd-al-Salam,Salam,Ibn-'Abd,Ibn-'Abd-al-Salam
9,this Jewish-crusade alliance,alliance,crusade,crusade alliance


Wall time: 2.92 s


## Merge Hyphenated Words

There are several hyphenated words that skew tokenisation; the tokeniser is therefore modified to keep hyphenated words together as single tokens.


spaCy treats hyphens as infixes, and its documentation includes some code for customising them. However, that approach seems to skew the dependency parsing: DO NOT USE.

Code taken from this answer: https://stackoverflow.com/questions/52293874/why-does-spacy-not-preserve-intra-word-hyphens-during-tokenization-like-stanford/52380286#comment97213137_52380286

In [5]:
import re
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER

def custom_tokenizer(nlp):
    """
    Build a tokenizer for `nlp` that does not split on hyphens between letters.

    The prefix/suffix searches and `token_match` are taken from the
    pipeline's current tokenizer and the exception rules from
    `nlp.Defaults`; only the infix patterns are replaced. The infix list
    deliberately has:
      * no rule splitting on hyphens between letters,
      * no split on "/" between letters, but a split on "," instead,
      * "&" as an infix, except between two upper-case letters (FOO&FOO).

    input: the loaded spaCy pipeline whose vocab and tokenizer settings
           are reused
    output: a new Tokenizer; the caller must assign it to `nlp.tokenizer`
    """
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            # Raw strings need a SINGLE backslash here: with `\\-` the
            # class would contain a literal backslash and no hyphen.
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            # `\.` is a literal period between a lower-case and an
            # upper-case letter (`\\.` would mean backslash + any char).
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
            ),
            # ":", "<", ">", "=" or "," between a letter/digit and a letter
            r"(?<=[{a}0-9])[:<>=,](?=[{a}])".format(a=ALPHA),
            # "&" when either side is lower-case or a digit, so that
            # upper&upper (FOO&FOO) stays in one piece
            r"(?<=[{a}0-9])[&](?=[{al}0-9])".format(a=ALPHA, al=ALPHA_LOWER),
            r"(?<=[{al}0-9])[&](?=[{a}0-9])".format(a=ALPHA, al=ALPHA_LOWER),
        ]
    )
    infix_re = compile_infix_regex(infixes)

    return Tokenizer(
        nlp.vocab,
        prefix_search=nlp.tokenizer.prefix_search,
        suffix_search=nlp.tokenizer.suffix_search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
        rules=nlp.Defaults.tokenizer_exceptions,
    )

# Swap the pipeline's tokenizer for the one built by custom_tokenizer, so
# every later nlp(...) call in the notebook uses the custom infix rules.
nlp.tokenizer = custom_tokenizer(nlp)

def get_compound(chunk):
    """
    Return the first "compound" left child of a noun chunk's root token.

    input: a noun chunk; its root's left children (`chunk.root.lefts`)
           are scanned in iteration order
    output: the first child with `dep_ == "compound"`, or None if the
            root has no compound child on its left
    """
    for left_child in chunk.root.lefts:
        if left_child.dep_ != "compound":
            continue
        return left_child
    return None

# Column order of the per-orator table; passing it to the DataFrame
# constructor keeps the columns present even when no chunk qualifies.
columns = ["Chunk Text", "Root Word", "Leftmost Compound", "New Chunk Text"]

for orator in docs:
    # Re-parse the text so the current tokenizer settings are applied.
    doc = nlp(docs[orator]["text"])

    # Walk the noun chunks once and reuse the list for both the total
    # count and the table rows.
    noun_chunks = list(doc.noun_chunks)
    docs[orator]["Total Noun Chunks"] = len(noun_chunks)

    rows = []
    for chunk in noun_chunks:
        # Look the compound up a single time per chunk.
        compound = get_compound(chunk)
        if compound is None:
            continue
        rows.append({"Chunk Text": chunk.text,
                     "Root Word": str(chunk.root),
                     # stored as the token object itself, not its text
                     "Leftmost Compound": compound,
                     # chunk text trimmed to start at the compound token
                     "New Chunk Text": doc[compound.i : chunk.end].text})

    docs[orator]["Compound Chunks"] = pd.DataFrame(rows, columns=columns)

captions = [f"{orator} yields {len(docs[orator]['Compound Chunks'])} Compound Chunks out of {docs[orator]['Total Noun Chunks']} Total Noun Chunks"
            for orator in docs]

display_side_by_side([docs[orator]["Compound Chunks"] for orator in docs], captions)

Unnamed: 0,Chunk Text,Root Word,Leftmost Compound,New Chunk Text
0,Mr. President Pro Tempore,Pro Tempore,President,President Pro Tempore
1,our National Anthem,Anthem,National,National Anthem
2,the British Prime Minister,Minister,Prime,Prime Minister
3,surprise attacks,attacks,surprise,surprise attacks
4,a fringe form,form,fringe,fringe form
5,a fringe movement,movement,fringe,fringe movement
6,the Taliban regime,regime,Taliban,Taliban regime
7,the Taliban regime,regime,Taliban,Taliban regime
8,the Taliban regime,regime,Taliban,Taliban regime
9,United States authorities,authorities,United States,United States authorities

Unnamed: 0,Chunk Text,Root Word,Leftmost Compound,New Chunk Text
0,O ye,ye,O,O ye
1,O mankind,mankind,O,O mankind
2,O ye,ye,O,O ye
3,[Koranic verses,verses,Koranic,Koranic verses
4,[Koranic verses,verses,Koranic,Koranic verses
5,the Jewish-crusade alliance aggression,aggression,alliance,alliance aggression
6,the unjust crusade campaign,campaign,crusade,crusade campaign
7,this Jewish-crusade alliance,alliance,Jewish-crusade,Jewish-crusade alliance
8,Umar 'Abd-al Rahman,Rahman,Abd-al,Abd-al Rahman
9,the superpower myth,myth,superpower,superpower myth


In [6]:
from spacy.tokens import Token, Span
        
def merge_compounds(doc):
    """
    Pipeline component that merges compound-linked terms in a doc.

    For each noun chunk whose root is not itself a compound, the first
    left child of the root with the "compound" dependency is located.
    If there is one, the span from that child to the end of the chunk is
    merged into a single token. The merged token is flagged with
    `token._.compound_merge = True` and takes its entity type from the
    compound token, falling back to the chunk root's entity type.

    input: a doc that has been dependency-parsed (`doc.noun_chunks` and
           `dep_` labels are read)
    output: the same doc object, retokenized in place
    """
    # Register the flag only if it is missing, rather than forcing a
    # re-registration for every document that passes through.
    if not Token.has_extension("compound_merge"):
        Token.set_extension("compound_merge", default=False)

    # All merges are queued inside one retokenize() context.
    with doc.retokenize() as retokenizer:
        for chunk in doc.noun_chunks:
            if chunk.root.dep_ == "compound":
                continue

            left_token = next(
                (child for child in chunk.root.lefts
                 if child.dep_ == "compound"),
                None,
            )
            if left_token is None:
                continue

            # Use the string form of the label on both sides so the
            # attribute value always has the same type.
            entity_type = left_token.ent_type_ or chunk.root.ent_type_

            attrs = {"ENT_TYPE": entity_type,
                     "_": {"compound_merge": True}}
            retokenizer.merge(doc[left_token.i : chunk.end], attrs=attrs)

    return doc

# Drop any copy registered by an earlier run of this cell, then append
# the component at the end of the pipeline.
if "merge_compounds" in nlp.pipe_names:
    nlp.remove_pipe("merge_compounds")
nlp.add_pipe(merge_compounds, last=True)

# For each orator, re-run the pipeline and keep the (text, entity type)
# pair of every token flagged as a compound merge.
captions = []
merged_tokens = []
for orator in docs:
    captions.append(f"Compound Merges for {orator}")
    parsed = nlp(docs[orator]["text"])
    merged_tokens.append([(token.text, token.ent_type_)
                          for token in parsed
                          if token._.compound_merge])

display_side_by_side(merged_tokens, captions)
        

Unnamed: 0,0,1
0,President Pro Tempore,PERSON
1,National Anthem,
2,Prime Minister,
3,surprise attacks,
4,fringe form,
5,fringe movement,
6,Taliban regime,ORG
7,Taliban regime,ORG
8,Taliban regime,ORG
9,United States authorities,GPE

Unnamed: 0,0,1
0,O ye,
1,O mankind,
2,O ye,
3,Koranic verses,WORK_OF_ART
4,Koranic verses,
5,alliance aggression,
6,crusade campaign,
7,Jewish-crusade alliance,
8,Abd-al Rahman,PERSON
9,superpower myth,
