<a href="https://colab.research.google.com/github/AminaZahid/Name-Entity-Extraction/blob/main/Entity_Name_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML



In [12]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Render custom entity spans with spaCy's EntityRenderer and display the HTML.

    The spans come from ``parse_custom_ents``: they are read from ``df[column]``
    when that column exists, otherwise from ``doc.ents``.

    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of a column of interest in the dataframe
    options -- options forwarded to the spaCy renderer, including colors
    page -- render markup as a full HTML page (default False)
    minify -- produce compact HTML (default False)
    idx -- index of the specific query or doc in the dataframe (default 0)

    Returns whatever ``display`` returns for the generated HTML.

    """
    entity_renderer = EntityRenderer(options=options)
    # The renderer expects a list of parsed documents; we only ever pass one.
    payload = parse_custom_ents(doc, df=df, idx=idx, column=column)
    markup = entity_renderer.render([payload], page=page, minify=minify)
    return display(HTML(markup.strip()))

def parse_custom_ents(doc, df, idx, column):
    """Build the dict that spaCy's entity renderer consumes for one document.

    Keyword arguments:
    doc -- a spaCy nlp doc object (its ``text`` and, as a fallback, ``ents`` are used)
    df -- a pandas dataframe object
    idx -- index for specific query or doc in dataframe
    column -- the name of a column of interest in the dataframe

    Returns a dict with keys 'text', 'ents' and 'title' (always None).

    """
    if column not in df.columns:
        # No custom column available: fall back to spaCy's own entities.
        spans = [
            {'start': span.start_char, 'end': span.end_char, 'label': span.label_}
            for span in doc.ents
        ]
    else:
        # Each stored record is indexed positionally: [1]=start, [2]=end, [3]=label.
        spans = []
        for record in df[column][idx]:
            spans.append({'start': record[1], 'end': record[2], 'label': record[3]})
    return {'text': doc.text, 'ents': spans, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    """Look up one row's text in a dataframe and render its entities in the notebook.

    Keyword arguments:
    idx -- index for specific query or doc in dataframe (required)
    df -- a pandas dataframe object with a 'text' column
    options -- various options to feed into the spaCy renderer, including colors
    column -- the name of a column of interest in the dataframe (default 'named_ents')

    """
    # `nlp` is the module-level spaCy pipeline.
    parsed_doc = nlp(df['text'][idx])
    custom_render(parsed_doc, df=df, column=column, options=options, idx=idx)
# Renderer options shared by every visualization in this notebook: maps each
# custom label (compound phrases, POS tags, noun phrases, cleaned entities)
# to the highlight color used when drawing it.
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [13]:
# Notebook-wide pandas settings.
pd.set_option('display.max_rows', 10) # show at most 10 rows when a dataframe is rendered
pd.options.mode.chained_assignment = None # silence the warning about working on a copy of a dataframe

In [14]:
# Load the 'en_core_web_sm' spaCy pipeline; the helper functions in this
# notebook read `nlp` as a global.
nlp = spacy.load('en_core_web_sm')

In [15]:
# Load the NER dataset (the path presumably points at a mounted Google Drive
# folder in Colab -- adjust when running elsewhere).
df = pd.read_csv('/content/drive/MyDrive/ner.csv')

# Keep only the first 10 rows and give them a fresh 0..n-1 index.
mini_df = df[:10]
mini_df.index = pd.RangeIndex(len(mini_df.index))

# comment this out to run on full dataset
df = mini_df

In [16]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [17]:
def lower(x):
    """Return ``x`` converted to lowercase.

    Keyword arguments:
    x -- a string (applied to each sentence to make everything lowercase)

    """
    return x.lower()

In [18]:
# Replace `df` with a single-column frame holding the lowercased sentences.
# NOTE: after this cell `df` no longer has a 'Sentence' column, so re-running
# the cell without reloading the data raises a KeyError.
df = pd.DataFrame(df['Sentence'].apply(lower))
df.columns = ['text']
display(df)

Unnamed: 0,text
0,thousands of demonstrators have marched throug...
1,families of soldiers killed in the conflict jo...
2,they marched from the houses of parliament to ...
3,"police put the number of marchers at 10,000 wh..."
4,the protest comes on the eve of the annual con...
5,the party is divided over britain 's participa...
6,the london march came ahead of anti-war protes...
7,the international atomic energy agency is to h...
8,iran this week restarted parts of the conversi...
9,iranian officials say they expect to get acces...


In [19]:
def extract_named_ents(text):
    """Run the spaCy pipeline on ``text`` and list its named entities.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Returns a list of (entity text, start char, end char, label) tuples.

    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.start_char, span.end_char, span.label_))
    return found

def add_named_ents(df):
    """Add a 'named_ents' column holding the entity tuples of each row's text.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with a 'text' column

    """
    texts = df['text']
    df['named_ents'] = texts.apply(extract_named_ents)
# Populate the 'named_ents' column in place, then show the frame.
add_named_ents(df)
display(df)

Unnamed: 0,text,named_ents
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,..."
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]"
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84..."
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ..."
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]"
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA..."


In [20]:
# Highlight the named entities stored for one row.
column = 'named_ents'
render_entities(9, df, options=options, column=column) # take a look at the sentence at index 9

In [21]:
def extract_nouns(text):
    """List the tokens that spaCy's POS tagger marks as PROPN or NOUN.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Returns a list of (token text, start char, end char, POS tag) tuples, where
    the end offset is the start offset plus the token length.

    """
    wanted_pos = ('PROPN', 'NOUN')
    selected = []
    for token in nlp(text):
        if token.pos_ not in wanted_pos:
            continue
        begin = token.idx
        selected.append((token.text, begin, begin + len(token.text), token.pos_))
    return selected

def add_nouns(df):
    """Add a 'nouns' column holding the noun tuples of each row's text.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with a 'text' column

    """
    noun_lists = df['text'].apply(extract_nouns)
    df['nouns'] = noun_lists
# Populate the 'nouns' column in place, then show the frame.
add_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ..."
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ..."
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),..."
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)..."
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p..."
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO..."


In [22]:
# Highlight the extracted nouns for the first row.
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [23]:
def extract_named_nouns(row_series):
    """Merge a row's nouns with the named entities that start at the same offset.

    For every noun, any named entity beginning at the noun's start offset
    replaces it; nouns without such an entity are kept as they are. Named
    entities that do not start where some noun starts are not included.

    Keyword arguments:
    row_series -- a Pandas Series object with 'nouns' and 'named_ents' entries

    Returns the de-duplicated tuples as a list sorted by start offset.

    """
    # Group the named entities by the character offset at which they begin.
    ents_by_start = {}
    for named in row_series['named_ents']:
        ents_by_start.setdefault(named[1], []).append(named)

    merged = set()
    for noun in row_series['nouns']:
        overlapping = ents_by_start.get(noun[1])
        if overlapping:
            merged.update(overlapping)
        else:
            merged.add(noun)

    return sorted(merged, key=lambda item: item[1])

def add_named_nouns(df):
    """Add a 'named_nouns' column combining each row's nouns and named entities.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with 'nouns' and 'named_ents' columns

    """
    # axis=1 hands each row to the extractor as a Series.
    merged_per_row = df.apply(extract_named_nouns, axis=1)
    df['named_nouns'] = merged_per_row
# Populate the 'named_nouns' column in place, then show the frame.
add_named_nouns(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ...","[(thousands, 0, 9, CARDINAL), (demonstrators, ..."
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(houses, 22, 28, NOUN), (parliament, 32, 42, ..."
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),..."
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)...","[(party, 4, 9, NOUN), (britain, 26, 33, GPE), ..."
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN...","[(london, 4, 10, GPE), (march, 11, 16, PROPN),..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p...","[(iran, 0, 4, GPE), (week, 10, 14, NOUN), (par..."
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(officials, 8, 17, NOUN), (access, 41, 47, NO..."


In [24]:
# Highlight the merged nouns / named entities for the row at index 1.
column = 'named_nouns'
render_entities(1, df, options=options, column=column)

In [25]:
# Example sentence; `text` is reused as a global by the noun-phrase demo below.
text = "Dr. Abraham is the primary author of this paper, and a physician in the specialty of internal medicine."

spacy.displacy.render(nlp(text), jupyter=True) # draw the sentence with spaCy's built-in displaCy renderer

In [26]:
def extract_noun_phrases(text):
    """List the noun chunks that spaCy finds in ``text``.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Returns a list of (chunk text, start char, end char, label) tuples.

    """
    phrases = []
    for chunk in nlp(text).noun_chunks:
        phrases.append((chunk.text, chunk.start_char, chunk.end_char, chunk.label_))
    return phrases

def add_noun_phrases(df):
    """Add a 'noun_phrases' column holding the noun chunks of each row's text.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with a 'text' column

    """
    phrase_lists = df['text'].apply(extract_noun_phrases)
    df['noun_phrases'] = phrase_lists
def visualize_noun_phrases(text):
    """Render the noun phrases of a single text via a throwaway one-row dataframe.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    """
    single_row = pd.DataFrame([text], columns=['text'])
    add_noun_phrases(single_row)
    # `options` is the notebook-level renderer configuration.
    render_entities(0, single_row, options=options, column='noun_phrases')
# Show the noun phrases of the example sentence held in the global `text`.
visualize_noun_phrases(text)

In [27]:
# Populate the 'noun_phrases' column in place, then show the frame.
add_noun_phrases(df)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ...","[(thousands, 0, 9, CARDINAL), (demonstrators, ...","[(thousands, 0, 9, NP), (demonstrators, 13, 26..."
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NP), (soldiers, 12, 20, NP),..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(they, 0, 4, NP), (the houses, 18, 28, NP), (..."
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NP), (the number, 11, 21, NP),..."
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(the protest, 0, 11, NP), (the eve, 21, 28, N..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)...","[(party, 4, 9, NOUN), (britain, 26, 33, GPE), ...","[(the party, 0, 9, NP), (britain 's participat..."
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN...","[(london, 4, 10, GPE), (march, 11, 16, PROPN),...","[(the london march, 0, 16, NP), (anti-war prot..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(the international atomic energy agency, 0, 3..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p...","[(iran, 0, 4, GPE), (week, 10, 14, NOUN), (par...","[(iran, 0, 4, NP), (parts, 25, 30, NP), (the c..."
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(iranian officials, 0, 17, NP), (they, 22, 26..."


In [28]:
# Highlight the noun phrases for the first row.
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

In [29]:
def extract_compounds(text):
    """Extract compound noun phrases with beginning and end idxs. 

    Walks the tokens of ``nlp(text)``, collecting tokens whose dependency label
    is 'compound' and closing the phrase with the token that directly follows
    the most recent compound token.

    Keyword arguments:
    text -- the actual text source from which to extract entities

    Returns a list of (phrase, start, end, 'COMPOUND') tuples, where ``end`` is
    computed as ``start + len(phrase)``.

    NOTE(review): ``start`` is taken from the first child of a compound token
    when it has children, and that child's text is only added to the phrase
    when the children contain a hyphen. The reported offsets may therefore not
    delimit the phrase text exactly -- confirm before relying on them.

    """
    comp_idx = 0       # token position (tok.i) of the most recent 'compound' token
    compound = []      # words collected for the phrase currently being built
    compound_nps = []  # finished (phrase, start, end, label) tuples
    tok_idx = 0        # character offset recorded as the start of the current phrase
    # `idx` from enumerate is not used below; positions come from tok.i.
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                # No children: only the first word of a phrase sets the start offset.
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase
        # (i.e. the token immediately after the latest compound token)
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            # A lone word (no preceding compound token collected) is not a phrase.
            if len(compound) > 1: 
                compound = ' '.join(compound)
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0 
            compound = []

    return compound_nps

def add_compounds(df):
    """Add a 'compounds' column holding the compound noun phrases of each row's text.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with a 'text' column

    """
    compound_lists = df['text'].apply(extract_compounds)
    df['compounds'] = compound_lists
# Populate the 'compounds' column in place, then show the frame.
add_compounds(df)
display(df)


Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ...","[(thousands, 0, 9, CARDINAL), (demonstrators, ...","[(thousands, 0, 9, NP), (demonstrators, 13, 26...",[]
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NP), (soldiers, 12, 20, NP),...","[(bush number, 109, 120, COMPOUND)]"
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(they, 0, 4, NP), (the houses, 18, 28, NP), (...","[(hyde park, 57, 66, COMPOUND)]"
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NP), (the number, 11, 21, NP),...",[]
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(the protest, 0, 11, NP), (the eve, 21, 28, N...","[(labor party, 75, 86, COMPOUND), (english sea..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)...","[(party, 4, 9, NOUN), (britain, 26, 33, GPE), ...","[(the party, 0, 9, NP), (britain 's participat...","[(iraq conflict, 58, 71, COMPOUND)]"
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN...","[(london, 4, 10, GPE), (march, 11, 16, PROPN),...","[(the london march, 0, 16, NP), (anti-war prot...","[(london march, 4, 16, COMPOUND)]"
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(the international atomic energy agency, 0, 3...","[(energy agency, 18, 31, COMPOUND), (low-level..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p...","[(iran, 0, 4, GPE), (week, 10, 14, NOUN), (par...","[(iran, 0, 4, NP), (parts, 25, 30, NP), (the c...","[(conversion process, 38, 56, COMPOUND)]"
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(iranian officials, 0, 17, NP), (they, 22, 26...","[(iaea surveillance system, 108, 132, COMPOUND)]"


In [30]:
# Highlight the compound noun phrases for the first row.
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [31]:
def extract_comp_nouns(row_series, cols=[]):
    """Collect the distinct text strings stored in the given columns of one row.

    Keyword arguments:
    row_series -- a Pandas Series object whose entries are lists of tuples
    cols -- the entries to read; the first item of every tuple is kept (default [])

    Returns a set of strings.

    """
    surface_forms = set()
    for col_name in cols:
        for entry in row_series[col_name]:
            surface_forms.add(entry[0])
    return surface_forms

def add_comp_nouns(df, cols=[]):
    """Add a 'comp_nouns' column with the merged entity strings of each row.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object
    cols -- a list of column names that need to be merged

    """
    # axis=1 hands each row to the extractor; `cols` is forwarded as a keyword.
    merged_per_row = df.apply(extract_comp_nouns, axis=1, cols=cols)
    df['comp_nouns'] = merged_per_row
# Merge the strings from the 'nouns' and 'compounds' columns into 'comp_nouns'.
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ...","[(thousands, 0, 9, CARDINAL), (demonstrators, ...","[(thousands, 0, 9, NP), (demonstrators, 13, 26...",[],"{iraq, thousands, london, war, withdrawal, dem..."
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NP), (soldiers, 12, 20, NP),...","[(bush number, 109, 120, COMPOUND)]","{soldiers, families, protesters, conflict, num..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(they, 0, 4, NP), (the houses, 18, 28, NP), (...","[(hyde park, 57, 66, COMPOUND)]","{houses, park, rally, hyde park, parliament, h..."
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NP), (the number, 11, 21, NP),...",[],"{marchers, organizers, police, number}"
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(the protest, 0, 11, NP), (the eve, 21, 28, N...","[(labor party, 75, 86, COMPOUND), (english sea...","{labor, britain, english, english seaside reso..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)...","[(party, 4, 9, NOUN), (britain, 26, 33, GPE), ...","[(the party, 0, 9, NP), (britain 's participat...","[(iraq conflict, 58, 71, COMPOUND)]","{iraq, troops, conflict, britain, participatio..."
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN...","[(london, 4, 10, GPE), (march, 11, 16, PROPN),...","[(the london march, 0, 16, NP), (anti-war prot...","[(london march, 4, 16, COMPOUND)]","{march, protests, london, paris, madrid, today..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(the international atomic energy agency, 0, 3...","[(energy agency, 18, 31, COMPOUND), (low-level...","{level, iran, wednesday, resumption, vienna, c..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p...","[(iran, 0, 4, GPE), (week, 10, 14, NOUN), (par...","[(iran, 0, 4, NP), (parts, 25, 30, NP), (the c...","[(conversion process, 38, 56, COMPOUND)]","{parts, iran, plant, process, conversion, conv..."
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(iranian officials, 0, 17, NP), (they, 22, 26...","[(iaea surveillance system, 108, 132, COMPOUND)]","{parts, plant, wednesday, access, surveillance..."


In [32]:
# take a look at all the nouns again (first row, 'named_nouns' column)
column = 'named_nouns'
render_entities(0, df, options=options, column=column)

In [33]:
# take a look at all the compound noun phrases again (first row, 'compounds' column)
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [34]:
# Inspect the merged noun / compound strings of the first row.
df['comp_nouns'][0] 

{'country',
 'demonstrators',
 'iraq',
 'london',
 'thousands',
 'troops',
 'war',
 'withdrawal'}

In [35]:
def drop_duplicate_np_splits(ents):
    """Remove single words that also appear as part of a multi-word entity.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; ``ents`` itself is not modified.

    """
    redundant = {
        piece
        for phrase in ents
        if len(phrase.split(' ')) > 1
        for piece in phrase.split(' ')
        if piece in ents
    }
    return ents - redundant

def drop_single_char_nps(ents):
    """Strip one-character words out of every entity.

    An entity made up solely of one-character words becomes the empty string.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set of the rewritten entities.

    """
    trimmed = set()
    for phrase in ents:
        kept_words = [word for word in phrase.split(' ') if len(word) != 1]
        trimmed.add(' '.join(kept_words))
    return trimmed

def drop_double_char(ents):
    """Remove every entity shorter than three characters.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; ``ents`` itself is not modified.

    """
    too_short = set()
    for candidate in ents:
        if len(candidate) <= 2:
            too_short.add(candidate)
    return ents - too_short

def keep_alpha(ents):
    """Keep only entities made of ASCII letters, hyphens, and spaces.

    Keyword arguments:
    ents -- a set of entities

    Returns a new set; ``ents`` itself is not modified.

    """
    allowed = set('-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    rejected = set()
    for candidate in ents:
        if any(ch not in allowed for ch in candidate):
            rejected.add(candidate)
    return ents - rejected

In [44]:
# NOTE(review): the commented-out path below looks like an earlier word-list
# source -- confirm that ner3.csv is the intended replacement.
#filename = './top-freq-english-words/freq_words.csv'
# Load the token table whose 'Token' column is used as the word list below.
freq_words_df = pd.read_csv('/content/drive/MyDrive/ner3.csv')
display(freq_words_df)

Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag
0,21826085_A,0,We,O
1,21826085_A,0,implemented,O
2,21826085_A,0,a,O
3,21826085_A,0,two,O
4,21826085_A,0,-,O
...,...,...,...,...
667070,23645249_A,4,cells,O
667071,23645249_A,4,during,O
667072,23645249_A,4,differentiation,O
667073,23645249_A,4,to,O


In [41]:
# Take the 'Token' column, skipping the first row (position 0).
freq_words = freq_words_df['Token'].iloc[1:]
display(freq_words)

1             implemented
2                       a
3                     two
4                       -
5                    step
               ...       
667070              cells
667071             during
667072    differentiation
667073                 to
667074         adipocytes
Name: Token, Length: 667074, dtype: object

In [46]:
_FREQ_WORDS_PATH = '/content/drive/MyDrive/ner3.csv'
_FREQ_WORDS_CACHE = {}


def _load_freq_words(path=_FREQ_WORDS_PATH):
    """Read the 'Token' column of ``path`` once and memoise it as a frozenset."""
    if path not in _FREQ_WORDS_CACHE:
        # .iloc[1:] skips the first data row, exactly as the original lookup did.
        _FREQ_WORDS_CACHE[path] = frozenset(pd.read_csv(path)['Token'].iloc[1:])
    return _FREQ_WORDS_CACHE[path]


def remove_freq_words(ents, freq_words=None):
    """Drop any entities that appear in the word list.

    By default the word list is the 'Token' column of
    '/content/drive/MyDrive/ner3.csv'. The file is read only on the first call
    and cached, because this function is applied once per dataframe row.

    Keyword arguments:
    ents -- a set of entities
    freq_words -- optional iterable of words to remove instead of the
                  file-backed list (default None, meaning load from the file)

    Returns a new set; ``ents`` itself is left untouched, so sets that are
    shared with other dataframe columns are never modified.

    """
    if freq_words is None:
        freq_words = _load_freq_words()
    # Words that are not present in `ents` are simply ignored.
    return set(ents).difference(freq_words)

def add_clean_ents(df, funcs=[]):
    """Add a 'clean_ents' column: 'comp_nouns' passed through each heuristic in turn.

    The dataframe is modified in place; nothing is returned.

    Keyword arguments:
    df -- a dataframe object with a 'comp_nouns' column
    funcs -- a list of heuristic functions to be applied to entities, in order

    """
    cleaned = df['comp_nouns']
    for heuristic in funcs:
        # Each heuristic receives one row's entities and returns the filtered result.
        cleaned = cleaned.apply(heuristic)
    df['clean_ents'] = cleaned

In [47]:
# Cleaning heuristics, applied left to right to each row's 'comp_nouns' set;
# the result is stored in the 'clean_ents' column.
funcs = [drop_duplicate_np_splits, drop_double_char, keep_alpha, drop_single_char_nps, remove_freq_words]
add_clean_ents(df, funcs)
display(df)

Unnamed: 0,text,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns,clean_ents
0,thousands of demonstrators have marched throug...,"[(thousands, 0, 9, CARDINAL), (london, 48, 54,...","[(thousands, 0, 9, NOUN), (demonstrators, 13, ...","[(thousands, 0, 9, CARDINAL), (demonstrators, ...","[(thousands, 0, 9, NP), (demonstrators, 13, 26...",[],"{iraq, thousands, london, war, withdrawal, dem...","{iraq, london, war, demonstrators, troops}"
1,families of soldiers killed in the conflict jo...,"[(bush, 109, 113, PERSON), (one, 121, 124, CAR...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NOUN), (soldiers, 12, 20, NO...","[(families, 0, 8, NP), (soldiers, 12, 20, NP),...","[(bush number, 109, 120, COMPOUND)]","{soldiers, families, protesters, conflict, num...","{soldiers, protesters, banners, slogans, bombi..."
2,they marched from the houses of parliament to ...,"[(hyde park, 57, 66, GPE)]","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(houses, 22, 28, NOUN), (parliament, 32, 42, ...","[(they, 0, 4, NP), (the houses, 18, 28, NP), (...","[(hyde park, 57, 66, COMPOUND)]","{houses, park, rally, hyde park, parliament, h...","{rally, parliament, hyde park}"
3,"police put the number of marchers at 10,000 wh...","[(10,000, 37, 43, CARDINAL), (1,00,000, 76, 84...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NOUN), (number, 15, 21, NOUN),...","[(police, 0, 6, NP), (the number, 11, 21, NP),...",[],"{marchers, organizers, police, number}","{marchers, organizers}"
4,the protest comes on the eve of the annual con...,"[(annual, 36, 42, DATE), (britain, 57, 64, GPE...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(protest, 4, 11, NOUN), (eve, 25, 28, NOUN), ...","[(the protest, 0, 11, NP), (the eve, 21, 28, N...","[(labor party, 75, 86, COMPOUND), (english sea...","{labor, britain, english, english seaside reso...","{britain, english seaside resort, labor party,..."
5,the party is divided over britain 's participa...,"[(britain, 26, 33, GPE), (iraq, 58, 62, GPE), ...","[(party, 4, 9, NOUN), (britain, 26, 33, PROPN)...","[(party, 4, 9, NOUN), (britain, 26, 33, GPE), ...","[(the party, 0, 9, NP), (britain 's participat...","[(iraq conflict, 58, 71, COMPOUND)]","{iraq, troops, conflict, britain, participatio...","{britain, troops, iraq conflict}"
6,the london march came ahead of anti-war protes...,"[(london, 4, 10, GPE), (today, 49, 54, DATE), ...","[(london, 4, 10, PROPN), (march, 11, 16, PROPN...","[(london, 4, 10, GPE), (march, 11, 16, PROPN),...","[(the london march, 0, 16, NP), (anti-war prot...","[(london march, 4, 16, COMPOUND)]","{march, protests, london, paris, madrid, today...","{protests, paris, madrid, cities, rome, london..."
7,the international atomic energy agency is to h...,"[(second day, 50, 60, DATE), (vienna, 73, 79, ...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(energy, 25, 31, NOUN), (agency, 32, 38, NOUN...","[(the international atomic energy agency, 0, 3...","[(energy agency, 18, 31, COMPOUND), (low-level...","{level, iran, wednesday, resumption, vienna, c...","{iran, wednesday, vienna, energy agency, talks..."
8,iran this week restarted parts of the conversi...,"[(iran, 0, 4, GPE), (this week, 5, 14, DATE)]","[(iran, 0, 4, PROPN), (week, 10, 14, NOUN), (p...","[(iran, 0, 4, GPE), (week, 10, 14, NOUN), (par...","[(iran, 0, 4, NP), (parts, 25, 30, NP), (the c...","[(conversion process, 38, 56, COMPOUND)]","{parts, iran, plant, process, conversion, conv...","{iran, conversion process}"
9,iranian officials say they expect to get acces...,"[(iranian, 0, 7, NORP), (wednesday, 87, 96, DA...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(officials, 8, 17, NOUN), (access, 41, 47, NO...","[(iranian officials, 0, 17, NP), (they, 22, 26...","[(iaea surveillance system, 108, 132, COMPOUND)]","{parts, plant, wednesday, access, surveillance...","{wednesday, officials, iaea surveillance system}"


In [48]:
def visualize_entities(df, idx=0):
    """Visualize the cleaned entities for a given row of the dataframe.

    Each entity in ``df['clean_ents'][idx]`` is located in the row's text with
    ``str.find`` (first occurrence only) and highlighted with the 'ENTITY' label.

    Keyword arguments:
    df -- a dataframe object with 'text' and 'clean_ents' columns
    idx -- the index of interest for the dataframe (default 0)

    """
    abstract = df['text'][idx]

    # store entity start and end index for visualization in dummy df
    ents = []
    for ent in df['clean_ents'][idx]:
        start = abstract.find(ent) # locate the index of the entity in the text
        # str.find returns -1 when the entity does not occur verbatim (the
        # cleaning heuristics can rewrite entities); such a hit would yield a
        # bogus (-1, len-1) span. Empty strings would yield a zero-width span.
        if not ent or start == -1:
            continue
        ents.append((ent, start, start + len(ent), 'ENTITY'))
    ents.sort(key=lambda tup: tup[1])

    # One-row frame in the layout render_entities expects.
    dummy_df = pd.DataFrame({'text': [abstract], 'clean_ents': [ents]})
    render_entities(0, dummy_df, options=options, column='clean_ents')
# Highlight the cleaned entities of the first row.
visualize_entities(df, 0)
