In [74]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import gc

In [75]:
DIR_PATH = os.getcwd()
ROOT_PATH = os.path.abspath(os.path.join(DIR_PATH, os.pardir))
DATA_ROOT = os.path.join(ROOT_PATH, 'data')
GAP_DATA_FOLDER = os.path.join(DATA_ROOT, 'gap')
SUB_DATA_FOLDER = os.path.join(DATA_ROOT, 'gendered-pronoun-resolution')
#FAST_TEXT_DATA_FOLDER = os.path.join(DATA_ROOT, 'fasttext-crawl-300d-2M.vec')

In [76]:
print(DATA_ROOT)

C:\Users\Isak\NeuralCoref\data


In [77]:
test_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-development.tsv')
train_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-test.tsv')
dev_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-validation.tsv')

train_df = pd.read_csv(train_df_path, sep='\t')
test_df = pd.read_csv(test_df_path, sep='\t')
dev_df = pd.read_csv(dev_df_path, sep='\t')

In [78]:
test_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera


In [79]:
from spacy.lang.en import English
from spacy.pipeline import DependencyParser
import spacy
from nltk import Tree

In [80]:
spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)

In [81]:
def neutralize_term(term, pos):
    male_PRON = ["He", "Him", "he", "him", "himself"]
    female_PRON = ["She", "Her", "she", "her", "herself"]
    neutral_PRON = ["It", "It", "it", "it", "itself"]
    male_DET = ["His", "his"]
    female_DET = ["Her", "her"]
    neutral_DET = ["Its", "its"]
    
    replace = False
    
    if pos == "PRON" and term in male_PRON:
        replace = True
    elif pos == "PRON" and term in female_PRON:
        replace = True               
    elif pos == "DET" and term in male_DET:
        replace = True               
    elif pos == "DET" and term in female_DET:
        replace = True               
    else:
        return term
    
    if replace:       
        if pos == "PRON":
            if term in male_PRON:
                index = male_PRON.index(term)
            elif term in female_PRON:
                index = female_PRON.index(term)
            neutral_term = neutral_PRON[index]

        elif pos == "DET":
            if term in male_DET:
                index = male_DET.index(term)
            elif term in female_DET:
                index = female_DET.index(term)
            neutral_term = neutral_DET[index]
    return neutral_term   

In [121]:
punctuation = [r".", r",", r":", r";", r"?", r"!", r"'s"]

def alter_dataframe(df):
    for row_idx in range(len(df)):
        text = df.loc[row_idx, "Text"]
        offsets = [df.loc[row_idx, "Pronoun-offset"], df.loc[row_idx, "A-offset"], df.loc[row_idx, "B-offset"]]
        new_text, new_offsets = neutralize_and_update(text, nlp(text), offsets)
        df.loc[row_idx, "Text"] = new_text
        df.loc[row_idx, "Pronoun-offset"] = offsets[0]
        df.loc[row_idx, "A-offset"] = offsets[1]
        df.loc[row_idx, "B-offset"] = offsets[2]
        df.loc[row_idx, "Pronoun"] = find_pronoun(new_text, df.loc[row_idx, "Pronoun-offset"])
        if not row_idx % 50:
            print("Progress: %i / %i documents" % (row_idx, len(df)))
    return df

def find_pronoun(text, offset):
    punctuation = [r".", r",", r":", r";", r"?", r"!", r"'s", r"n't"]
    index = offset
    character = text[index]
    pronoun = ""
    while character not in punctuation and character != " ":
        pronoun += character
        index += 1
        character = text[index]
    return pronoun

def neutralize_and_update(text, processed_text, offsets):
    all_gender_prons = ["he", "him", "himself", "she", "her", "herself", "his", "her"]
    char_count = 0
    first = True
    new_text = ""
    
    num_pronouns_before = []
    for entity in range(3):
        offset = offsets[entity]
        substring = nlp(text[0:offset])
        pron_count = 0
        for token in substring:
            if token.lower_ in all_gender_prons:
                pron_count += 1
        num_pronouns_before.append(pron_count)
    
    modifications = []
    for token in processed_text:
        #print(token.text, token.pos_)
        new_token = neutralize_term(token.text, token.pos_)
        if new_token != token.text:            
            diff = len(new_token) - len(token)
            modifications.append(diff)
                     
        if (token.text not in punctuation) and not first:
            new_text += r" "
        first = False
        new_text += new_token
            
    #print("Length modifications:", modifications)
    #print("Number of preceeding pronouns:", num_pronouns_before)
    for idx, offset in enumerate(offsets):
        for mod in range(num_pronouns_before[idx]):
            offset += modifications[mod]
        offsets[idx] = offset
                
    return new_text, offsets

In [160]:
def get_span_from_offset(text, offset):
    
    for token in spans(text):
        
        if offset == token[2]:
            return token[0]

def spans(text):
    
    doc = nlp(text)
    offset = 0
    for token_count, token in enumerate(doc):
        token = str(token)
        #for token_count, token in enumerate(tokens):
        offset = text.find(token, offset)
        yield token_count, token, offset, offset+len(token)
        offset += len(token)

In [166]:
row = 10
text = dev_df.loc[row, "Text"]
pronoun_offset = dev_df.loc[row, "Pronoun-offset"]
dev_df.loc[10,:]

ID                                                    validation-11
Text              This particular government recalled all the Gr...
Pronoun                                                          he
Pronoun-offset                                                  418
A                                                  Ioannis Mamouris
A-offset                                                        273
A-coref                                                       False
B                                                         Kallergis
B-offset                                                        435
B-coref                                                        True
URL                http://en.wikipedia.org/wiki/Dimitrios_Kallergis
Name: 10, dtype: object

In [168]:
span = get_span_from_offset(text, pronoun_offset)
processed_text = nlp(text)
print(processed_text[span])

he


In [122]:
punctuation = [r".", r",", r":", r";", r"?", r"!", r"'s"]

def compare_pronouns(corpus_df, num_docs):
    for row in range(num_docs):
        text = corpus_df.loc[row, "Text"]
        correct_pronoun = corpus_df.loc[row, "Pronoun"]
        pronoun = ""
        start = corpus_df.loc[row, "Pronoun-offset"]

        index = start
        character = text[start]
        while character not in punctuation and character != " ":
            pronoun += character
            index += 1
            character = text[index]
        print("Correct pronoun:", correct_pronoun, ", and in text:", pronoun)

In [124]:
#test_ntr_df = test_df.copy()
dev_ntr_df = alter_dataframe(dev_df.copy())

Progress: 0 / 454 documents
Progress: 50 / 454 documents
Progress: 100 / 454 documents
Progress: 150 / 454 documents
Progress: 200 / 454 documents
Progress: 250 / 454 documents
Progress: 300 / 454 documents
Progress: 350 / 454 documents
Progress: 400 / 454 documents
Progress: 450 / 454 documents


In [136]:
row = 10
text = nlp(dev_df.loc[row, "Text"])
ntr_text = nlp(dev_ntr_df.loc[row, "Text"])
for word in range(len(text)):
    print(text[word], ntr_text[word])

This This
particular particular
government government
recalled recalled
all all
the the
Greek Greek
officers officers
who who
participated participated
in in
the the
anti anti
- -
Ottoman Ottoman
revolutionary revolutionary
movements movements
in in
Thessaly Thessaly
, ,
Epirus Epirus
and and
Macedonia Macedonia
to to
return return
to to
Greece Greece
while while
by by
personal personal
requirement requirement
of of
Kallergis Kallergis
, ,
Otto Otto
's 's
adjutants-- adjutants--
Gennaios Gennaios
Kolokotronis Kolokotronis
, ,
Spyromilios Spyromilios
, ,
Ioannis Ioannis
Mamouris Mamouris
and and
Gardikiotis Gardikiotis
Grivas Grivas
-- --
were were
dismissed dismissed
, ,
while while
the the
hitherto hitherto
Minister Minister
of of
Military Military
Affairs Affairs
, ,
Skarlatos Skarlatos
Soutsos Soutsos
, ,
was was
suspended suspended
. .
When When
he it
was was
minister minister
, ,
Kallergis Kallergis
formed formed
for for
the the
first first
time time
in in
Greece Greece
a a
fire f

In [159]:
row = 5
print(dev_df.loc[row, "Text"])
print(dev_ntr_df.loc[row, "Text"])
dev_ntr_df.head()

No amount of logic can shatter a faith consciously based on a lie.'' According to The Skeptic's Dictionary, an example of this syndrome is evidenced by an event in 1988 when stage magician James Randi, at the request of an Australian news program, coached stage performer Jos* Alvarez to pretend he was channelling a two-thousand-year-old spirit named ``Carlos''.
No amount of logic can shatter a faith consciously based on a lie. ' ' According to The Skeptic's Dictionary, an example of this syndrome is evidenced by an event in 1988 when stage magician James Randi, at the request of an Australian news program, coached stage performer Jos * Alvarez to pretend it was channelling a two - thousand - year - old spirit named ` ` Carlos ' '.


Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,validation-1,It admitted making four trips to China and pla...,it,256,Jose de Venecia Jr,208,False,Abalos,241,False,http://en.wikipedia.org/wiki/Commission_on_Ele...
1,validation-2,"Kathleen Nott was born in Camberwell, London. ...",It,185,Ellen,110,False,Kathleen,150,True,http://en.wikipedia.org/wiki/Kathleen_Nott
2,validation-3,"When it returns to its hotel room, a Liberian ...",ring,431,Jason Scott Lee,379,False,Danny,402,True,http://en.wikipedia.org/wiki/Hawaii_Five-0_(20...
3,validation-4,"On 19 March 2007, during a campaign appearance...",it,333,Reucassel,300,True,Debnam,325,False,http://en.wikipedia.org/wiki/Craig_Reucassel
4,validation-5,"By this time, Karen Blixen had separated from ...",,427,Finch Hatton,290,False,Beryl Markham,328,True,http://en.wikipedia.org/wiki/Denys_Finch_Hatton


In [148]:
find_pronoun(dev_df.loc[4, "Text"], dev_df.loc[4, "Pronoun-offset"])

'she'

In [127]:
compare_pronouns(dev_ntr_df, 10)

Correct pronoun: it , and in text: it
Correct pronoun: It , and in text: It
Correct pronoun: ring , and in text: ring
Correct pronoun: it , and in text: it
Correct pronoun:  , and in text: 
Correct pronoun: nd , and in text: nd
Correct pronoun: ng , and in text: ng
Correct pronoun: its , and in text: its
Correct pronoun: wan , and in text: wan
Correct pronoun: hat , and in text: hat


In [156]:
def test_df(df, row_idx):
    text = df.loc[row_idx, "Text"]
    pronoun = df.loc[row_idx, "Pronoun"]
    pronoun_offset = df.loc[row_idx, "Pronoun-offset"]
    pronoun_in_text = find_pronoun(text, pronoun_offset)
    print(text, "\n", pronoun, pronoun_offset, pronoun_in_text)

In [157]:
test_df(test_ntr_df, 50)

She auditions to be a dancer at Cheung Lai Yuen for a better income and a chance to find her father, who is the Emperor. Evergreen Mak as Ko Yan (**) / Kiu Bo-lung (***), a Music Bureau official who looks over Cheung Lai Yuen. In a quarrel with Ming-but, Bo-lung loses his memory and gets half of his face burnt. 
 his 269 his


In [149]:
string_a = "My name is Isak and I am a happy boy."
text = nlp(string_a)

In [154]:
text[3] = "Frederik"

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment