In [8]:
!python --version

Python 3.7.11


In [9]:
# !python -m pip install spacy==2.1.0
# !pip install neuralcoref
# !python -m spacy download en_core_web_lg

## NOTE: after running the install commands above, restart the runtime before executing the cells below.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Dependencies for this cell: `re` is used by the text-cleaning helper,
# spaCy provides the NLP pipeline and NeuralCoref the coreference component.
import re

import spacy
import neuralcoref

# Load the `en_core_web_lg` English spaCy model.
nlp = spacy.load('en_core_web_lg')

# Add NeuralCoref to the spaCy pipeline. The call returns the pipeline object,
# which is what the notebook echoes as this cell's output.
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f77f24bdd90>

In [3]:
# Bracketed citation markers. The first alternative keeps the original rule
# (any one- or two-character marker such as [7], [14] or [a]); the second also
# strips numeric references of any length, e.g. [123].
_CITATION_RE = re.compile(r'\[(?:.{1,2}|\d+)\]')

# A parenthesised aside containing no ')'. A single negated class is used so an
# unclosed '(' cannot trigger quadratic backtracking.
_PAREN_RE = re.compile(r'\([^)]*\)')


def clean_wikitext(text):
    """Strip citation markers and parenthesised asides from Wikipedia text.

    Args:
        text: Raw paragraph text, e.g. ``'born in Ulm.[7] (see below)'``.

    Returns:
        The text with bracketed citation markers and ``(...)`` groups removed.
        Surrounding whitespace is left untouched, so removing a parenthesised
        group can leave a double space behind.
    """
    without_citations = _CITATION_RE.sub('', text)
    return _PAREN_RE.sub('', without_citations)
    
clean_wikitext('Maria "Maja" Einstein (18 November 1881 – 25 June 1951) and her older brother, Albert, were the two children of Hermann Einstein and Pauline Einstein (née Koch), who had moved from Ulm to Munich in June 1881, when Albert was one.[14] There Hermann and his brother Jakob had founded Einstein & Cie., an electrical ')

'Maria "Maja" Einstein  and her older brother, Albert, were the two children of Hermann Einstein and Pauline Einstein , who had moved from Ulm to Munich in June 1881, when Albert was one. There Hermann and his brother Jakob had founded Einstein & Cie., an electrical '

In [4]:
# You're done. You can now use NeuralCoref as you usually manipulate a SpaCy document annotations.
# doc = nlp(u'Kimberly Noel Kardashian was born on October 21, 1980, in Los Angeles, California, to Robert and Kris Kardashian. She has an older sister, Kourtney, a younger sister, Khloé, and a younger brother, Rob. Their mother is of Dutch, English, Irish, and Scottish ancestry, while their father was a third-generation Armenian-American. After their parents divorced in 1991, her mother married again that year, to Bruce Jenner, the 1976 Summer Olympics decathlon winner. Through their marriage, Kim Kardashian gained step-brothers Burton "Burt", Brandon, and Brody; step-sister Casey; and half-sisters Kendall and Kylie Jenner.')
# Run the pipeline (including NeuralCoref) on a cleaned sample paragraph.
doc = nlp(clean_wikitext('Maria "Maja" Einstein (18 November 1881 – 25 June 1951) and her older brother, Albert, were the two children of Hermann Einstein and Pauline Einstein (née Koch), who had moved from Ulm to Munich in June 1881, when Albert was one.[14] There Hermann and his brother Jakob had founded Einstein & Cie., an electrical engineering company.[15]'))
clusters = doc._.coref_clusters

# Show whether any coreference was resolved, followed by the clusters themselves.
display(doc._.has_coref, clusters)
# display(doc._.coref_scores)

True

[Maria "Maja" Einstein: [Maria "Maja" Einstein, her], Hermann: [Hermann, his]]

In [6]:
[d for d in doc.ents if d.label_ == 'PERSON']

[Maria "Maja" Einstein,
 Albert,
 Hermann Einstein,
 Pauline Einstein,
 Albert,
 Hermann,
 Jakob]

In [7]:
# NOTE(review): this raises IndexError for the sample paragraph — only two
# coreference clusters were resolved above, so there is no cluster at index 3.
clusters[3].main.ents

IndexError: list index out of range

In [7]:
# Keep only the clusters whose main (resolved) span contains at least one PERSON
# named entity. A main span with no named entities at all cannot contain a
# PERSON entity, so this one test also removes the entity-free clusters.
clusters = [
    cluster for cluster in clusters
    if any(ent.label_ == 'PERSON' for ent in cluster.main.ents)
]

# TODO: Find a way to parse out people from groups (older sister, Kourtney, a younger sister, Khloé)
# and include their coreferences by part.
clusters

[Maria "Maja" Einstein: [Maria "Maja" Einstein, her], Hermann: [Hermann, his]]

In [None]:
# Inspect the first coreference cluster: its main span, then all of its mentions.
display(doc._.coref_clusters[0].main, doc._.coref_clusters[0].mentions)

In [None]:
type(doc._.coref_clusters[0])

In [9]:
# Start from every PERSON entity, paired with an empty span (doc[:0]) in the
# second slot; that slot holds the main span of a coreference cluster for the
# mentions appended later.
found_entities = [
    (person, doc[:0])
    for person in doc.ents
    if person.label_ == 'PERSON'
]

display(found_entities[:5])
type(found_entities[0])

[(Maria "Maja" Einstein, ),
 (Albert, ),
 (Hermann Einstein, ),
 (Pauline Einstein, ),
 (Albert, )]

tuple

In [10]:
# TODO: Might need to abstract found_entities to include who the entities are linked to (if any)
for cluster in clusters:
    antecedent = cluster.main
    for mention in cluster.mentions:
        # Skip the main span itself; add every other mention paired with the
        # main span, without duplicating a pair that is already present.
        if mention != antecedent:
            pair = (mention, antecedent)
            if pair not in found_entities:
                found_entities.append(pair)

display(found_entities)

[(Maria "Maja" Einstein, ),
 (Albert, ),
 (Hermann Einstein, ),
 (Pauline Einstein, ),
 (Albert, ),
 (Hermann, ),
 (Jakob, ),
 (her, Maria "Maja" Einstein),
 (his, Hermann)]

In [None]:
# Maximum distance, in characters, from the start of the first entity to the
# end of the second one for the pair to be printed.
MAX_RANGE = 140

# doc.text is loop-invariant, so read it once instead of on every iteration.
doc_text = doc.text

for (ent1, coref1) in found_entities:
    for (ent2, coref2) in found_entities:
        # Skip a span paired with itself, and a mention paired with the span
        # recorded as its coreference target (in either direction).
        if ent2 == ent1 or coref2 == ent1 or coref1 == ent2:
            continue

        # ent2 must start after ent1 and end inside the MAX_RANGE window.
        if ent2.end_char <= ent1.start_char + MAX_RANGE and ent1.start_char < ent2.start_char:
            print(f'ENT 1 ({ent1}), ENT 2 ({ent2})')
            encoded_e1 = f'[e1] {ent1.text} [/e1]'
            encoded_e2 = f'[e2] {ent2.text} [/e2]'
            # The text between the two entities is kept verbatim between the tags.
            new_span = encoded_e1 + doc_text[ent1.end_char : ent2.start_char] + encoded_e2
            print(new_span)

In [27]:
pd.read_excel('NER_test.xlsx')['Entities'].to_list()[0].split(',')

['Albert Einstein',
 ' Hermann Einstein',
 ' Pauline Koch',
 ' Einstein',
 ' Jakob']

In [177]:
# NOTE(review): find_entities and split_entities appear to be defined only in a
# later cell of this notebook, so on a fresh kernel run top to bottom this cell
# would fail with NameError — confirm and move the definitions above this cell.
#
# Load the labelled spreadsheet and derive, in order: the cleaned text, the list
# of true entities, and the list of predicted PERSON entities.
entities = pd.read_excel('NER_test.xlsx').assign(
    cleaned_text=lambda frame: frame['Text'].apply(clean_wikitext),
    true_entities=lambda frame: frame['Entities'].apply(split_entities),
    predicted_entities=lambda frame: (
        frame['cleaned_text'].apply(find_entities).apply(split_entities)
    ),
)
entities

Unnamed: 0,Text,Entities
0,"Albert Einstein was born in Ulm,[7] in the Kin...","Albert Einstein, Hermann Einstein, Pauline Koc..."
1,"Maria ""Maja"" Einstein (18 November 1881 – 25 J...","Maria ""Maja"" Einstein, Albert, Hermann Einstei..."
2,Kimberly Noel Kardashian was born on October 2...,"Kimberly Noel Kardashian, Robert, Kris Kardash..."
3,Peter Michael Davidson[4] was born on November...,"Peter Michael Davidson, Amy, Scott Matthew Dav..."
4,Kid Cudi was born Scott Ramon Seguro Mescudi i...,"Kid Cudi, Scott Ramon Seguro Mescudi, Domingo,..."
5,"Prince Rogers Nelson was born in Minneapolis, ...","Prince Rogers Nelson, Mattie Della, John Lewis..."
6,"Louis Sedell Hayes was born in Detroit, Michig...","Louis Sedell Hayes, John Nelson, Prince, Hayes..."
7,Coltrane was born in his parents' apartment at...,"Coltrane, John R. Coltrane, Alice Blair"


In [178]:
# Recompute the 'cleaned_text' column from the raw 'Text' column (overwriting any
# existing column of that name in place) and display the frame.
entities['cleaned_text'] = entities['Text'].apply(clean_wikitext)
entities

Unnamed: 0,Text,Entities,cleaned_text
0,"Albert Einstein was born in Ulm,[7] in the Kin...","Albert Einstein, Hermann Einstein, Pauline Koc...","Albert Einstein was born in Ulm, in the Kingdo..."
1,"Maria ""Maja"" Einstein (18 November 1881 – 25 J...","Maria ""Maja"" Einstein, Albert, Hermann Einstei...","Maria ""Maja"" Einstein and her older brother, ..."
2,Kimberly Noel Kardashian was born on October 2...,"Kimberly Noel Kardashian, Robert, Kris Kardash...",Kimberly Noel Kardashian was born on October 2...
3,Peter Michael Davidson[4] was born on November...,"Peter Michael Davidson, Amy, Scott Matthew Dav...",Peter Michael Davidson was born on November 16...
4,Kid Cudi was born Scott Ramon Seguro Mescudi i...,"Kid Cudi, Scott Ramon Seguro Mescudi, Domingo,...",Kid Cudi was born Scott Ramon Seguro Mescudi i...
5,"Prince Rogers Nelson was born in Minneapolis, ...","Prince Rogers Nelson, Mattie Della, John Lewis...","Prince Rogers Nelson was born in Minneapolis, ..."
6,"Louis Sedell Hayes was born in Detroit, Michig...","Louis Sedell Hayes, John Nelson, Prince, Hayes...","Louis Sedell Hayes was born in Detroit, Michig..."
7,Coltrane was born in his parents' apartment at...,"Coltrane, John R. Coltrane, Alice Blair",Coltrane was born in his parents' apartment at...


In [74]:
def find_entities(wikitext):
    """Return the PERSON entities found in ``wikitext`` as a single string.

    The text is processed with the module-level ``nlp`` pipeline. The text of
    every entity labelled ``PERSON`` is kept in document order (duplicates
    included) and the pieces are joined with ``', '``.
    """
    people = (str(ent) for ent in nlp(wikitext).ents if ent.label_ == 'PERSON')
    return ', '.join(people)

def split_entities(ent_string):
    """Split a comma-separated entity string into a list of clean names.

    Args:
        ent_string: Names separated by commas, e.g. ``'Albert, Hermann Einstein'``.
            ``None`` and the empty string are treated as "no entities".

    Returns:
        The names in their original order with surrounding whitespace removed.
        Duplicates are kept. Empty pieces (from an empty input, doubled commas
        or a trailing comma) are dropped so they are never counted as entities.
    """
    if not ent_string:
        return []
    names = (piece.strip() for piece in ent_string.split(','))
    return [name for name in names if name]

In [179]:
# Parse the comma-separated 'Entities' labels into a list of names per row.
entities['true_entities'] = entities['Entities'].apply(split_entities)

In [180]:
# Predict the PERSON entities for each cleaned text (find_entities returns one
# joined string), then split that string into a list of names per row.
entities['predicted_entities'] = (
    entities['cleaned_text'].apply(find_entities).apply(split_entities)
)

In [181]:
entities

Unnamed: 0,Text,Entities,cleaned_text,true_entities,predicted_entities
0,"Albert Einstein was born in Ulm,[7] in the Kin...","Albert Einstein, Hermann Einstein, Pauline Koc...","Albert Einstein was born in Ulm, in the Kingdo...","[Albert Einstein, Hermann Einstein, Pauline Ko...","[Albert Einstein, Hermann Einstein, Pauline Ko..."
1,"Maria ""Maja"" Einstein (18 November 1881 – 25 J...","Maria ""Maja"" Einstein, Albert, Hermann Einstei...","Maria ""Maja"" Einstein and her older brother, ...","[Maria ""Maja"" Einstein, Albert, Hermann Einste...","[Maria ""Maja"" Einstein, Albert, Hermann Einste..."
2,Kimberly Noel Kardashian was born on October 2...,"Kimberly Noel Kardashian, Robert, Kris Kardash...",Kimberly Noel Kardashian was born on October 2...,"[Kimberly Noel Kardashian, Robert, Kris Kardas...","[Kimberly Noel Kardashian, Robert, Kris Kardas..."
3,Peter Michael Davidson[4] was born on November...,"Peter Michael Davidson, Amy, Scott Matthew Dav...",Peter Michael Davidson was born on November 16...,"[Peter Michael Davidson, Amy, Scott Matthew Da...","[Peter Michael Davidson, Amy, Scott Matthew Da..."
4,Kid Cudi was born Scott Ramon Seguro Mescudi i...,"Kid Cudi, Scott Ramon Seguro Mescudi, Domingo,...",Kid Cudi was born Scott Ramon Seguro Mescudi i...,"[Kid Cudi, Scott Ramon Seguro Mescudi, Domingo...","[Kid Cudi, Scott Ramon Seguro Mescudi, Solon, ..."
5,"Prince Rogers Nelson was born in Minneapolis, ...","Prince Rogers Nelson, Mattie Della, John Lewis...","Prince Rogers Nelson was born in Minneapolis, ...","[Prince Rogers Nelson, Mattie Della, John Lewi...","[Prince Rogers Nelson, Mattie Della, John Lewi..."
6,"Louis Sedell Hayes was born in Detroit, Michig...","Louis Sedell Hayes, John Nelson, Prince, Hayes...","Louis Sedell Hayes was born in Detroit, Michig...","[Louis Sedell Hayes, John Nelson, Prince, Haye...","[Louis Sedell Hayes, John Nelson, Prince, Haye..."
7,Coltrane was born in his parents' apartment at...,"Coltrane, John R. Coltrane, Alice Blair",Coltrane was born in his parents' apartment at...,"[Coltrane, John R. Coltrane, Alice Blair]","[Coltrane, John R. Coltrane, Alice Blair, Will..."


In [85]:
entities['predicted_entities'][3]

['Peter Michael Davidson',
 'Amy',
 'Scott Matthew Davidson',
 'Requiem Mass',
 'Great Kills']

In [84]:
entities['true_entities'][3]

['Peter Michael Davidson', 'Amy', 'Scott Matthew Davidson']

In [205]:
entities

Unnamed: 0,Text,Entities,cleaned_text,true_entities,predicted_entities
0,"Albert Einstein was born in Ulm,[7] in the Kin...","Albert Einstein, Hermann Einstein, Pauline Koc...","Albert Einstein was born in Ulm, in the Kingdo...","[Albert Einstein, Hermann Einstein, Pauline Ko...","[Albert Einstein, Hermann Einstein, Pauline Ko..."
1,"Maria ""Maja"" Einstein (18 November 1881 – 25 J...","Maria ""Maja"" Einstein, Albert, Hermann Einstei...","Maria ""Maja"" Einstein and her older brother, ...","[Maria ""Maja"" Einstein, Albert, Hermann Einste...","[Maria ""Maja"" Einstein, Albert, Hermann Einste..."
2,Kimberly Noel Kardashian was born on October 2...,"Kimberly Noel Kardashian, Robert, Kris Kardash...",Kimberly Noel Kardashian was born on October 2...,"[Kimberly Noel Kardashian, Robert, Kris Kardas...","[Kimberly Noel Kardashian, Robert, Kris Kardas..."
3,Peter Michael Davidson[4] was born on November...,"Peter Michael Davidson, Amy, Scott Matthew Dav...",Peter Michael Davidson was born on November 16...,"[Peter Michael Davidson, Amy, Scott Matthew Da...","[Peter Michael Davidson, Amy, Scott Matthew Da..."
4,Kid Cudi was born Scott Ramon Seguro Mescudi i...,"Kid Cudi, Scott Ramon Seguro Mescudi, Domingo,...",Kid Cudi was born Scott Ramon Seguro Mescudi i...,"[Kid Cudi, Scott Ramon Seguro Mescudi, Domingo...","[Kid Cudi, Scott Ramon Seguro Mescudi, Solon, ..."
5,"Prince Rogers Nelson was born in Minneapolis, ...","Prince Rogers Nelson, Mattie Della, John Lewis...","Prince Rogers Nelson was born in Minneapolis, ...","[Prince Rogers Nelson, Mattie Della, John Lewi...","[Prince Rogers Nelson, Mattie Della, John Lewi..."
6,"Louis Sedell Hayes was born in Detroit, Michig...","Louis Sedell Hayes, John Nelson, Prince, Hayes...","Louis Sedell Hayes was born in Detroit, Michig...","[Louis Sedell Hayes, John Nelson, Prince, Haye...","[Louis Sedell Hayes, John Nelson, Prince, Haye..."
7,Coltrane was born in his parents' apartment at...,"Coltrane, John R. Coltrane, Alice Blair",Coltrane was born in his parents' apartment at...,"[Coltrane, John R. Coltrane, Alice Blair]","[Coltrane, John R. Coltrane, Alice Blair, Will..."
8,"Heath was born in Philadelphia on October 25, ...","Heath, Heath, Percy Heath, Albert Heath","Heath was born in Philadelphia on October 25, ...","[Heath, Heath, Percy Heath, Albert Heath]","[Percy Heath, Albert Heath]"
9,Heath originally played alto saxophone. He ear...,"Heath, Howard McGhee, Dizzy Gillespie, Charlie...",Heath originally played alto saxophone. He ear...,"[Heath, Howard McGhee, Dizzy Gillespie, Charli...","[Howard McGhee, Dizzy Gillespie, Charlie Parker]"


In [206]:
def evaluate_NER(true_entities, predicted_entities):
    """Score predicted entity lists against the ground truth, row by row.

    Matching is by exact string equality and respects multiplicity: within a
    row, each true entity can be matched by at most one prediction.

    Args:
        true_entities: Sequence of per-row lists of ground-truth entity strings.
        predicted_entities: Sequence of per-row lists of predicted entity
            strings, indexed in parallel with ``true_entities``.

    Returns:
        A 3-tuple ``(metrics, counts, errors)``:
          * ``metrics`` — ``{'precision': float, 'recall': float}``; a metric
            is ``0.0`` when its denominator is zero (no predictions, or no
            true entities) instead of raising ZeroDivisionError.
          * ``counts`` — ``{'tp': int, 'fp': int, 'fn': int}``.
          * ``errors`` — ``{'fp_arr': [...], 'fn_arr': [...]}``, the unmatched
            predictions and the unmatched true entities, in encounter order.

    Raises:
        IndexError: If ``predicted_entities`` has fewer rows than
            ``true_entities``.
    """
    tp = 0
    fp_arr = []
    fn_arr = []
    for i in range(len(true_entities)):
        # Shallow copy: matched entities are consumed from this list, and the
        # caller's data must not be mutated. The elements are strings, so a
        # deep copy is unnecessary.
        unmatched = list(true_entities[i])
        for pred in predicted_entities[i]:
            if pred in unmatched:
                unmatched.remove(pred)
                tp += 1
            else:
                fp_arr.append(pred)
        # Whatever was never matched in this row is a false negative.
        fn_arr.extend(unmatched)

    fp = len(fp_arr)
    fn = len(fn_arr)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return (
        {'precision': precision, 'recall': recall},
        {'tp': tp, 'fp': fp, 'fn': fn},
        {'fp_arr': fp_arr, 'fn_arr': fn_arr},
    )

In [207]:
# Reload the labelled spreadsheet and rebuild every derived column from scratch:
# cleaned text, list of true entities, list of predicted PERSON entities.
entities = pd.read_excel('NER_test.xlsx').assign(
    cleaned_text=lambda frame: frame['Text'].apply(clean_wikitext),
    true_entities=lambda frame: frame['Entities'].apply(split_entities),
    predicted_entities=lambda frame: (
        frame['cleaned_text'].apply(find_entities).apply(split_entities)
    ),
)
entities

Unnamed: 0,Text,Entities,cleaned_text,true_entities,predicted_entities
0,"Albert Einstein was born in Ulm,[7] in the Kin...","Albert Einstein, Hermann Einstein, Pauline Koc...","Albert Einstein was born in Ulm, in the Kingdo...","[Albert Einstein, Hermann Einstein, Pauline Ko...","[Albert Einstein, Hermann Einstein, Pauline Ko..."
1,"Maria ""Maja"" Einstein (18 November 1881 – 25 J...","Maria ""Maja"" Einstein, Albert, Hermann Einstei...","Maria ""Maja"" Einstein and her older brother, ...","[Maria ""Maja"" Einstein, Albert, Hermann Einste...","[Maria ""Maja"" Einstein, Albert, Hermann Einste..."
2,Kimberly Noel Kardashian was born on October 2...,"Kimberly Noel Kardashian, Robert, Kris Kardash...",Kimberly Noel Kardashian was born on October 2...,"[Kimberly Noel Kardashian, Robert, Kris Kardas...","[Kimberly Noel Kardashian, Robert, Kris Kardas..."
3,Peter Michael Davidson[4] was born on November...,"Peter Michael Davidson, Amy, Scott Matthew Dav...",Peter Michael Davidson was born on November 16...,"[Peter Michael Davidson, Amy, Scott Matthew Da...","[Peter Michael Davidson, Amy, Scott Matthew Da..."
4,Kid Cudi was born Scott Ramon Seguro Mescudi i...,"Kid Cudi, Scott Ramon Seguro Mescudi, Domingo,...",Kid Cudi was born Scott Ramon Seguro Mescudi i...,"[Kid Cudi, Scott Ramon Seguro Mescudi, Domingo...","[Kid Cudi, Scott Ramon Seguro Mescudi, Solon, ..."
5,"Prince Rogers Nelson was born in Minneapolis, ...","Prince Rogers Nelson, Mattie Della, John Lewis...","Prince Rogers Nelson was born in Minneapolis, ...","[Prince Rogers Nelson, Mattie Della, John Lewi...","[Prince Rogers Nelson, Mattie Della, John Lewi..."
6,"Louis Sedell Hayes was born in Detroit, Michig...","Louis Sedell Hayes, John Nelson, Prince, Hayes...","Louis Sedell Hayes was born in Detroit, Michig...","[Louis Sedell Hayes, John Nelson, Prince, Haye...","[Louis Sedell Hayes, John Nelson, Prince, Haye..."
7,Coltrane was born in his parents' apartment at...,"Coltrane, John R. Coltrane, Alice Blair",Coltrane was born in his parents' apartment at...,"[Coltrane, John R. Coltrane, Alice Blair]","[Coltrane, John R. Coltrane, Alice Blair, Will..."
8,"Heath was born in Philadelphia on October 25, ...","Heath, Heath, Percy Heath, Albert Heath","Heath was born in Philadelphia on October 25, ...","[Heath, Heath, Percy Heath, Albert Heath]","[Percy Heath, Albert Heath]"
9,Heath originally played alto saxophone. He ear...,"Heath, Howard McGhee, Dizzy Gillespie, Charlie...",Heath originally played alto saxophone. He ear...,"[Heath, Howard McGhee, Dizzy Gillespie, Charli...","[Howard McGhee, Dizzy Gillespie, Charlie Parker]"


In [208]:
import copy
# Deep-copy the true and predicted entity columns (as arrays of per-row lists)
# to use as the evaluation inputs.
gt = copy.deepcopy(entities['true_entities'].values)
preds = copy.deepcopy(entities['predicted_entities'].values)

In [209]:
evaluate_NER(gt, preds)

({'precision': 0.8951612903225806, 'recall': 0.8222222222222222},
 {'tp': 111, 'fp': 13, 'fn': 24},
 {'fp_arr': ['Requiem Mass',
   'Great Kills',
   'Solon',
   'Prince',
   "Horace Silver's",
   'the Oscar Peterson Trio',
   'William Penn High School',
   "Miles Davis's",
   'Davison Ray',
   'Theodore',
   'Simon',
   'Richard',
   'Katherine Coleman'],
  'fn_arr': ['Einstein',
   'Khloé',
   'Domingo',
   'Philly Joe Jones',
   'Horace Silver',
   'Cannonnball Adderly',
   'Oscar Peterson',
   'Heath',
   'Heath',
   'Heath',
   'Miles Davis',
   'Virginia Davison Ray',
   'Thompson',
   'Wenner',
   'Sim',
   'Wenner',
   'Wenner',
   'Theodore "Theo" Simon',
   'Wenner',
   'India Rose',
   'Florence',
   'Little Richard',
   'Dylan',
   'Creola Katherine Coleman']})

In [2]:
import pandas as pd
import numpy as np

# NOTE(review): hard-coded path into a local Downloads folder — the file must
# exist at exactly this location for the cell to run; consider a configurable
# data directory.
data = '~/Downloads/data (18).json'

# Load the JSON file into a DataFrame and display it.
df = pd.read_json(data)
df

Unnamed: 0,text,label,baseParagraph
0,[e1] Kimberly Noel Kardashian [/e1] was born o...,PARENT-CHILD,
1,[e1] Kimberly Noel Kardashian [/e1] was born o...,PARENT-CHILD,
2,"[e1] She [/e1] has an older sister, [e2] Kourt...",SIBLINGS,
3,"[e1] She [/e1] has an older sister, Kourtney, ...",SIBLINGS,
4,"[e1] She [/e1] has an older sister, Kourtney, ...",SIBLINGS,
...,...,...,...
83,[e1] She [/e1] was the daughter of [e2] David ...,PARENT-CHILD,Conor Cruise O'Brien was born at 44 Leinster R...
84,"[e1] She [/e1] had three sisters, [e2] Hanna [...",SIBLINGS,Conor Cruise O'Brien was born at 44 Leinster R...
85,"[e1] She [/e1] had three sisters, Hanna, [e2] ...",SIBLINGS,Conor Cruise O'Brien was born at 44 Leinster R...
86,"[e1] She [/e1] had three sisters, Hanna, Marga...",SIBLINGS,Conor Cruise O'Brien was born at 44 Leinster R...
