In [3]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span  

# Setup: load the spaCy pipeline that the rest of the notebook uses.
nlp = spacy.load("en_core_web_md")

# Run the program against the first page of the dataset.
with open("first_text.txt") as f:
    raw_text = f.read()
test = nlp(raw_text)

In [6]:
def show_entities(doc, label="PERSON"):
    """Print every entity of ``doc`` that carries the given label.

    Each matching entity is printed on its own line as
    ``<entity text> - <label> - <spaCy explanation of the label>``.

    Args:
        doc: Object exposing an ``ents`` sequence whose items have ``text``
            and ``label_`` attributes (here: the processed document).
        label: Entity label to show. Defaults to ``"PERSON"``, which is the
            behaviour callers that pass only ``doc`` get.

    Returns:
        None. The result is printed.
    """
    if not doc.ents:
        print('No named entities found.')
        return

    matching_ents = [ent for ent in doc.ents if ent.label_ == label]
    if not matching_ents:
        # The document has entities, just none with the requested label.
        # Previously this case produced no output at all.
        print('No ' + label + ' entities found.')
        return

    # The explanation depends only on the label, so look it up once.
    description = str(spacy.explain(label))
    for ent in matching_ents:
        print(ent.text + ' - ' + ent.label_ + ' - ' + description)


show_entities(test)

Tara Devi - PERSON - People, including fictional
Chandrima Shaha - PERSON - People, including fictional
Mahlathini - PERSON - People, including fictional
Antiguan - PERSON - People, including fictional
Dharma Man - PERSON - People, including fictional


In [None]:
# the idea now is to single out all entities under the "PERSON" label in order to find the sentences which are about people

In [None]:
# but first we are going to add all types of pronouns to the "PERSON" entities so that those are matched as well,
# since a sentence may refer to a person using only pronouns

In [7]:
# Surface forms of the pronouns that should be tagged as "PERSON".
# Capitalised and lower-case variants are listed separately because the
# PhraseMatcher compares the verbatim token text by default.
phrase_list_fp = ["I", "me", "My", "my", "Me", "myself", "Myself", "mine", "Mine"]
# "Hisself"/"hisself" replaces the misspelled "Hiself"/"hiself".
phrase_list_male = ["He", "he", "His", "his", "Him", "him", "Hisself", "hisself", "Himself", "himself"]
phrase_list_female = ["She", "she", "Her", "her", "Hers", "hers", "Herself", "herself"]
# "Theirs"/"theirs" added for parity with "Hers" and "Mine" in the other lists.
phrase_list_they = ["They", "they", "Them", "them", "Their", "their", "Theirs", "theirs", "Themself", "themself", "themselves", "Themselves"]

# Combined third-person list; the first-person forms are not part of it.
phrase_list_pronouns = phrase_list_female + phrase_list_male + phrase_list_they

# Phrase patterns only need to be tokenized, so use nlp.make_doc instead of
# running the whole pipeline (tagger, parser, NER) on every single word.
phrase_patterns_fp = [nlp.make_doc(text) for text in phrase_list_fp]
phrase_patterns_male = [nlp.make_doc(text) for text in phrase_list_male]
phrase_patterns_female = [nlp.make_doc(text) for text in phrase_list_female]
phrase_patterns_they = [nlp.make_doc(text) for text in phrase_list_they]

In [8]:
# Register every pronoun pattern list with the matcher under its own rule name.
matcher = PhraseMatcher(nlp.vocab)

pronoun_rules = (
    ("first_p_pron", phrase_patterns_fp),
    ("male_pron", phrase_patterns_male),
    ("female_pron", phrase_patterns_female),
    ("they_them", phrase_patterns_they),
)
for rule_name, rule_patterns in pronoun_rules:
    matcher.add(rule_name, None, *rule_patterns)

# Run the matcher over the processed document.
matches = matcher(test)

In [9]:
# the new pronoun matches which were not included in the entity matches above;
# each entry is a 3-tuple whose second and third values are later used as the
# start and end of a Span over the document
matches

[(7572202541173719006, 95, 96),
 (11199099686251259307, 118, 119),
 (11199099686251259307, 168, 169),
 (3853633594520772826, 187, 188),
 (3853633594520772826, 189, 190),
 (11199099686251259307, 191, 192),
 (7572202541173719006, 207, 208)]

In [10]:
# ID of the "PERSON" label in the document's string store; used as the label
# of the spans created from the pronoun matches.
pron = test.vocab.strings[u'PERSON']

# Token positions that already belong to an entity. Entities on a document
# must not overlap, so a pronoun match is only turned into an entity when all
# of its tokens are still free. This also makes the cell safe to re-run: on a
# second run the pronouns are already entities and are simply skipped.
occupied_tokens = {i for ent in test.ents for i in range(ent.start, ent.end)}

new_ents = []
for _match_id, start, end in matches:
    span_tokens = range(start, end)
    if occupied_tokens.isdisjoint(span_tokens):
        new_ents.append(Span(test, start, end, label=pron))
        # Reserve the tokens so that two matches cannot claim the same token.
        occupied_tokens.update(span_tokens)

test.ents = list(test.ents) + new_ents

In [11]:
# now the "PERSON" entities cover both the named people and the pronouns in the data
show_entities(test)

Tara Devi - PERSON - People, including fictional
Chandrima Shaha - PERSON - People, including fictional
her - PERSON - People, including fictional
He - PERSON - People, including fictional
Mahlathini - PERSON - People, including fictional
His - PERSON - People, including fictional
Antiguan - PERSON - People, including fictional
me - PERSON - People, including fictional
I - PERSON - People, including fictional
his - PERSON - People, including fictional
Her - PERSON - People, including fictional
Dharma Man - PERSON - People, including fictional


In [12]:
# Simple heuristic for deciding whether a sentence is about a person:
# it is, as soon as one of its entities carries the "PERSON" label.
for sent in test.sents:
    print(sent)

    person = any(ent.label_ == "PERSON" for ent in sent.ents)
    if person:
        print("This sentence is probably about a person!")
    else:
        print("This sentence is probably not about a person.")
        
    

So Tara Devi received informal tuition at home.
This sentence is probably about a person!

Chandrima Shaha (born 14 October 1952) is an Indian biologist, currently President of Indian National Science Academy (2020-22), and Professor of Eminence at the National Institute of Immunology.
This sentence is probably about a person!


This sentence is probably not about a person.
Moreover, it should be noted that the country yielded to the principles of the United Nations Commission on International Trade, that is the rules set for cross-border insolvency cases, ensuring fairness for debtors and creditors.
This sentence is probably not about a person.

In 2019, Forbes ranked her the 8th most powerful woman in the world, having been in the top 10 for the last 4 years.
This sentence is probably about a person!

He has also served as executive director of the Caribbean Centre for Money and Finance (CCMF).
This sentence is probably about a person!

In 1972 the line-up of the Mahotella Queens dis

In [20]:
# Build the list of sentences that are about people, i.e. those containing at
# least one entity under the new, expanded "PERSON" label, and count both groups.
sentences = []
non_person_sents = 0

for sent in test.sents:
    if any(ent.label_ == "PERSON" for ent in sent.ents):
        sentences.append(sent)
    else:
        non_person_sents += 1

# Every person sentence was appended exactly once, so the list length is the count.
person_sents = len(sentences)

print(person_sents)
print(non_person_sents)
print(sentences)

8
3
[So Tara Devi received informal tuition at home., 
Chandrima Shaha (born 14 October 1952) is an Indian biologist, currently President of Indian National Science Academy (2020-22), and Professor of Eminence at the National Institute of Immunology., 
In 2019, Forbes ranked her the 8th most powerful woman in the world, having been in the top 10 for the last 4 years., 
He has also served as executive director of the Caribbean Centre for Money and Finance (CCMF)., 
In 1972 the line-up of the Mahotella Queens disintegrated after royalty disagreements with Bopape, Mahlathini left the company in a similar dispute with the producer., 
His personal best of 9.91 seconds is the Antiguan national record over 100 m., 
This patient said to me after I replaced his second hip using the PATH®, 
Her grandfather Dharma Man Tuladhar was a philanthropist best known for renovating the Swayambhui stupa in 1918.]


In [None]:
# now we use the matched sentences to try and decide their probability for gender bias:

In [None]:
# each sentence is awarded a point value between 0 and 5, which determines its likelihood of bias
# certain factors, like the number of pronouns or the presence of "They/Them", weigh more heavily in this
# since these are more likely to cause problems with gender bias

In [23]:
def bias_probability(text):
    """Score the person-related sentences of ``text`` and print a bias summary.

    A sentence is person-related when one of its entities is labelled
    "PERSON". Each such sentence collects points:

    * +1 for every token found in ``phrase_list_they``
    * +1 if it has exactly one ``nsubj`` token, +2 if it has two or more
    * +2 if at least one token is found in ``phrase_list_pronouns``

    The points map to a bucket: 0 -> no, 1 -> low, 2-3 -> regular,
    4 or more -> high. The number of sentences per bucket is printed,
    followed by a legend explaining the buckets.

    Args:
        text: The processed document. If it exposes ``sents``, its sentences
            are filtered and scored. Otherwise the module-level ``sentences``
            list is scored, which is what this function always did before.

    Returns:
        None. The summary is printed.
    """
    if hasattr(text, "sents"):
        # Derive the sentences from the argument instead of relying on a
        # global that must have been built by an earlier cell.
        person_sentences = [
            sent for sent in text.sents
            if any(ent.label_ == "PERSON" for ent in sent.ents)
        ]
    else:
        # Fallback keeps the previous behaviour for any other kind of argument.
        person_sentences = sentences

    # Sets give constant-time membership checks inside the token loop.
    gendered_pronouns = set(phrase_list_pronouns)
    they_pronouns = set(phrase_list_they)

    no_count = 0
    low_count = 0
    medium_count = 0
    high_count = 0

    for sent in person_sentences:

        prob_count = 0
        pers_count = 0
        pron_count = 0

        for token in sent:
            if token.dep_ == "nsubj":
                pers_count += 1
            if token.text in gendered_pronouns:
                pron_count += 1
            if token.text in they_pronouns:
                prob_count += 1

        if pers_count == 1:
            prob_count += 1
        elif pers_count >= 2:
            prob_count += 2

        if pron_count >= 1:
            prob_count += 2

        if prob_count == 0:
            no_count += 1
        elif prob_count == 1:
            low_count += 1
        elif prob_count <= 3:
            medium_count += 1
        else:
            # Several "they" tokens can push the score above 5. Such sentences
            # used to match no bucket and were dropped from the summary.
            high_count += 1

    print("There's " + str(no_count) + " sentence(s) with no, " + str(low_count) + " with low, " + str(medium_count) + " with regular and " + str(high_count) + " sentence(s) with high probability of gender bias.")
    print(""" 
    'no probability' \t \t indicates that there is little chance this sentence would provoke gender bias.
    'low probability' \t \t indicates that the sentence is probably only relevant when dealing with proper names of people.
    'regular probability' \t indicates that the sentence includes gender-specific pronouns and thus could provoke gender bias.
    'high probability' \t \t indicates that the sentence contains a lot of pronouns and/or a use of 'They/them' and is thus very susceptible for gender bias.""")


In [24]:
# print the bias summary for the processed document
bias_probability(test)

There's 0 sentence(s) with no, 2 with low, 5 with regular and 1 sentence(s) with high probability of gender bias.
 
    'no probability' 	 	 indicates that there is little chance this sentence would provoke gender bias.
    'low probability' 	 	 indicates that the sentence is probably only relevant when dealing with proper names of people.
    'regular probability' 	 indicates that the sentence includes gender-specific pronouns and thus could provoke gender bias.
    'high probability' 	 	 indicates that the sentence contains a lot of pronouns and/or a use of 'They/them' and is thus very susceptible for gender bias.
