In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import gc

In [3]:
# Data folders are resolved relative to the parent of the current working directory.
DIR_PATH = os.getcwd()
ROOT_PATH = os.path.abspath(os.path.join(DIR_PATH, os.pardir))
DATA_ROOT = os.path.join(ROOT_PATH, 'data')
# Both dataset folders live directly under DATA_ROOT.
GAP_DATA_FOLDER, SUB_DATA_FOLDER = (
    os.path.join(DATA_ROOT, folder_name)
    for folder_name in ('gap', 'gendered-pronoun-resolution')
)
#FAST_TEXT_DATA_FOLDER = os.path.join(DATA_ROOT, 'fasttext-crawl-300d-2M.vec')

In [4]:
# Locations of the three GAP splits inside the GAP data folder.
train_df_path, dev_df_path, test_df_path = (
    os.path.join(GAP_DATA_FOLDER, file_name)
    for file_name in ('gap-development.tsv', 'gap-validation.tsv', 'gap-test.tsv')
)

# Every split is a tab-separated file; read them in train, dev, test order.
train_df, dev_df, test_df = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_df_path, dev_df_path, test_df_path)
)

In [5]:
# Preview the first rows of the loaded training frame.
train_df.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A,A-offset,A-coref,B,B-offset,B-coref,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,True,Pauline,207,False,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,True,Bernard Leach,251,False,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,False,De la Sota,246,True,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,False,Henry Rosenthal,336,True,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,False,Rivera,294,True,http://en.wikipedia.org/wiki/Jessica_Rivera


In [6]:
# Initialize spaCy model
from spacy.lang.en import English
from spacy.pipeline import DependencyParser
import spacy
from nltk import Tree

# Name of the spaCy pipeline package passed to spacy.load below.
spacy_model = "en_core_web_lg"
# Load the pipeline once; `nlp` is the object later cells call on each text.
nlp = spacy.load(spacy_model)

In [7]:
# Define gendered pronouns.
# The PRON tables are index-aligned: position i in male_PRON / female_PRON is
# replaced by position i in neutral_PRON. The first four entries are the
# capitalised forms and the last four are the same forms in lowercase.
male_PRON = ["He", "Him", "His", "Himself", "he", "him", "his", "himself"]
female_PRON = ["She", "Her", "Hers", "Herself", "she", "her", "hers", "herself"]
# Index 6 pairs with "his"/"hers" and must be "its", mirroring "Its" at index 2
# (it used to be "it", so lowercase possessives were neutralized differently
# from capitalised ones).
neutral_PRON = ["It", "It", "Its", "Itself", "it", "it", "its", "itself"]
# The DET tables are index-aligned in the same way (capitalised, lowercase).
male_DET = ["His", "his"]
female_DET = ["Her", "her"]
neutral_DET = ["Its", "its"]

# Lowercase forms used to decide whether a token is a candidate for replacement.
all_gender_prons = ["he", "him", "his", "himself", "she", "her", "hers", "herself"]

# Define punctuation
punctuation = [r".", r",", r":", r";", r"?", r"!", r"'s"]

In [8]:
# For a term, check if it needs replacing, then replace

def find_neutral_term(term, pos):
    """Return the neutral replacement for a gendered term, or the term itself.

    Args:
        term: Token text, matched case-sensitively against the pronoun tables.
        pos: Coarse part-of-speech label of the token. Only "PRON" and "DET"
            select a replacement table.

    Returns:
        The entry of the neutral table at the same index as `term` in the
        male or female table for that part of speech. `term` is returned
        unchanged when `pos` is neither "PRON" nor "DET", or when `term` is
        not listed in the tables for `pos` (for example a personal pronoun
        that was labelled "DET").
    """
    if pos == "PRON":
        gendered_tables, neutral_table = (male_PRON, female_PRON), neutral_PRON
    elif pos == "DET":
        gendered_tables, neutral_table = (male_DET, female_DET), neutral_DET
    else:
        # Any other label (e.g. a fine-grained tag) is left untouched.
        return term

    # Male table is consulted before the female one.
    for table in gendered_tables:
        if term in table:
            return neutral_table[table.index(term)]
    return term  # Failproof: gendered word whose label does not match a table

In [9]:
# Find new character offset, given text, mention and old character offset

def get_char_offset(text, mention, offset):
    """Locate the occurrence of `mention` in `text` closest to `offset`.

    Used to re-anchor a mention after the text was rewritten and character
    positions may have shifted.

    Args:
        text: The (rewritten) text to search.
        mention: The mention string to look for.
        offset: The character offset the mention had before the rewrite.

    Returns:
        The start index of the occurrence nearest to `offset`. When an
        occurrence before and one after are equally far away, the earlier one
        wins, because the neutral forms in this notebook are never longer than
        the gendered ones they replace, so mentions can only move left.
        Returns -1 (like `str.find`) when `mention` does not occur in `text`;
        the previous implementation never terminated in that case.
    """
    offset = max(offset, 0)  # a negative bound would be read relative to the end

    # Nearest occurrence that starts at or before the old offset.
    before = text.rfind(mention, 0, offset + len(mention))
    if before == offset:
        return before

    # Nearest occurrence that starts after the old offset.
    after = text.find(mention, offset)
    if before == -1:
        return after  # also -1 when the mention is absent altogether
    if after == -1:
        return before
    return before if offset - before <= after - offset else after

In [10]:
# Process spaCy text into neutralized format

def neutralize_text(nlp_doc, pronoun_span):
    """Rebuild a document's text with gendered pronouns replaced by neutral ones.

    Args:
        nlp_doc: Iterable of tokens exposing `text`, `lower_`, `pos_` and
            `text_with_ws`.
        pronoun_span: Token index of the target pronoun; a value that matches
            no token index (e.g. None) leaves the returned pronoun empty.

    Returns:
        A `(new_text, pronoun)` tuple: the rebuilt text, and the (possibly
        replaced) text of the token at `pronoun_span`, or "" if none matched.
    """
    pieces = []
    target_pronoun = ""
    for position, tok in enumerate(nlp_doc):
        is_candidate = tok.lower_ in all_gender_prons
        replacement = find_neutral_term(tok.text, tok.pos_) if is_candidate else tok.text
        pieces.append(replacement)
        # Trailing whitespace on the token is re-emitted as a single space.
        if len(tok.text_with_ws) != len(tok.text):
            pieces.append(" ")
        if position == pronoun_span:
            target_pronoun = replacement
    return "".join(pieces), target_pronoun

In [11]:
# Get span (word offset) from char offset

def get_span_from_offset(nlp_doc, text, offset):
    """Return the index of the token that starts exactly at `offset`.

    Args:
        nlp_doc: Iterable of tokens; each is converted with `str()`.
        text: The text the tokens were taken from.
        offset: Character offset to look up.

    Returns:
        The token index whose start position equals `offset`, or None when no
        token starts there.
    """
    matches = (
        token_index
        for token_index, _, start, _ in spans(nlp_doc, text)
        if start == offset
    )
    return next(matches, None)

def spans(nlp_doc, text):
    """Yield `(token_index, token_string, start, end)` for each token.

    Start positions are found by searching `text` for each token string,
    beginning where the previous token ended.
    """
    cursor = 0
    for token_index, raw_token in enumerate(nlp_doc):
        surface = str(raw_token)
        start = text.find(surface, cursor)
        end = start + len(surface)
        yield token_index, surface, start, end
        cursor = end

In [14]:
# Iterate through dataframe, update text, pronoun and all three offsets

def augment_df(df):
    """Neutralize gendered pronouns in every row and refresh the offsets.

    For each row the "Text" is parsed once, rewritten with neutral pronouns,
    and the "Pronoun" column plus the three offset columns are updated to
    match the rewritten text.

    Args:
        df: DataFrame with the columns "Text", "Pronoun", "Pronoun-offset",
            "A", "A-offset", "B" and "B-offset". It is modified in place, so
            pass a copy to keep the original. Rows are addressed by index
            label, so any unique index works, not only the default 0..n-1.

    Returns:
        The same DataFrame object, after modification.
    """
    mention_columns = (
        ("Pronoun", "Pronoun-offset"),
        ("A", "A-offset"),
        ("B", "B-offset"),
    )

    # Iterate over the actual index labels: `.loc` is label-based, so
    # range(len(df)) is only correct for a default RangeIndex.
    for row_label in df.index:
        original_text = df.loc[row_label, "Text"]
        nlp_doc = nlp(original_text)  # Call this just once per row in df

        pronoun_offset = df.loc[row_label, "Pronoun-offset"]
        pronoun_span = get_span_from_offset(nlp_doc, original_text, pronoun_offset)
        text, pronoun = neutralize_text(nlp_doc, pronoun_span)
        df.loc[row_label, "Text"] = text
        df.loc[row_label, "Pronoun"] = pronoun

        # Update character offsets against the rewritten text. The pronoun
        # column was updated above, so the neutral form is what gets located.
        for mention_col, offset_col in mention_columns:
            mention = str(df.loc[row_label, mention_col])
            old_offset = df.loc[row_label, offset_col]
            df.loc[row_label, offset_col] = get_char_offset(text, mention, old_offset)

    print("Dataframe successfully augmented.")
    return df

In [131]:
# Neutralize a copy of each split so the frames loaded earlier stay untouched.
# Noted runtimes: train takes 1 min 20 s, dev takes 18 s, test takes 1 min 20 s.
train_ntr_df, dev_ntr_df, test_ntr_df = (
    augment_df(split_df.copy()) for split_df in (train_df, dev_df, test_df)
)

Dataframe successfully augmented.
Dataframe successfully augmented.
Dataframe successfully augmented.


In [18]:
def compare_mentions(df, num_docs):
    """Print stored mentions next to what the stored offsets point at.

    For rows 0..num_docs-1 (looked up by label with `.loc`), prints the
    "Pronoun", "A" and "B" values followed by the strings `get_mention`
    extracts from "Text" at the corresponding offset columns.
    """
    mention_cols = ("Pronoun", "A", "B")
    offset_cols = ("Pronoun-offset", "A-offset", "B-offset")
    for row in range(num_docs):
        text = df.loc[row, "Text"]
        stored = [df.loc[row, col] for col in mention_cols]
        located = [get_mention(text, df.loc[row, col]) for col in offset_cols]
        print("Correct mentions:", *stored, ", and in text:", *located)
        
def get_mention(text, offset):
    """Return the run of characters starting at `offset` up to a delimiter.

    The run ends at the first space, at the first character listed in
    `punctuation`, or at the end of `text`. Characters are compared one at a
    time, so multi-character entries in `punctuation` never match, and a
    multi-word mention is cut at its first space.

    Args:
        text: Text to read from.
        offset: Start position of the mention.

    Returns:
        The extracted string; "" when `offset` is negative or past the end of
        `text` (previously a negative offset wrapped around to the end of the
        string and a mention reaching the end of `text` raised IndexError).
    """
    if offset < 0:
        return ""
    end = offset
    # Bounds check first so a mention that ends the text does not overrun.
    while end < len(text) and text[end] != " " and text[end] not in punctuation:
        end += 1
    return text[offset:end]

In [19]:
# Spot-check the first 10 rows of the neutralized validation frame.
compare_mentions(dev_ntr_df, 10)

Correct mentions: it Jose de Venecia Jr Abalos , and in text: it Jose Abalos
Correct mentions: It Ellen Kathleen , and in text: It Ellen Kathleen
Correct mentions: its Jason Scott Lee Danny , and in text: its Jason Danny
Correct mentions: it Reucassel Debnam , and in text: it Reucassel Debnam
Correct mentions: it Finch Hatton Beryl Markham , and in text: it Finch Beryl
Correct mentions: it James Randi Jos* Alvarez , and in text: it James Jos*
Correct mentions: It von Sanders Faik Pasha , and in text: It von Faik
Correct mentions: its Colin Jake Burns , and in text: its Colin Jake
Correct mentions: it Scott Cowan , and in text: it Scott Cowan
Correct mentions: its Beverley Callard Liz , and in text: its Beverley Liz


In [132]:
# Write the neutralized splits as tab-separated files into the GAP data folder.
# The folder comes from GAP_DATA_FOLDER (set in the path-configuration cell)
# rather than a hardcoded '../data/gap' string, so reading and writing always
# agree on the location.
train_ntr_df.to_csv(os.path.join(GAP_DATA_FOLDER, 'gap-development-neutral-it.tsv'), sep='\t', index=False)
dev_ntr_df.to_csv(os.path.join(GAP_DATA_FOLDER, 'gap-validation-neutral-it.tsv'), sep='\t', index=False)
test_ntr_df.to_csv(os.path.join(GAP_DATA_FOLDER, 'gap-test-neutral-it.tsv'), sep='\t', index=False)

In [90]:
# Diagnostic: print the POS label spaCy assigns to every "hers" token in the test split.
df = test_df
female_pronouns = ["her", "hers"]  # only referenced by the commented-out check below
for text in df["Text"]:
    doc = nlp(text)
    for token in doc:
        #if token.lower_ in female_pronouns:
        if token.lower_ == "hers":
            print(token.text, token.pos_)

hers PRON
hers PRON
