# Topic 8: Portrayal of Characters in Novels

## Preliminaries 
Run this cell. Note that this also loads spaCy.

In [1]:
import sys
sys.path.append(r'\\ad.susx.ac.uk\ITS\TeachingResources\Departments\Informatics\LanguageEngineering\resources')
#sys.path.append(r'/Users/davidw/Documents/teach/NLE/resources')
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict,Counter
from itertools import zip_longest
from IPython.display import display
from random import seed
import random
import math
from pylab import rcParams
from operator import itemgetter, attrgetter, methodcaller
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import csv
from operator import itemgetter, attrgetter, methodcaller
import matplotlib.pylab as pylab
%matplotlib inline
params = {'legend.fontsize': 'large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'large',
         'axes.titlesize':'large',
         'xtick.labelsize':'large',
         'ytick.labelsize':'large'}
pylab.rcParams.update(params)
get_ipython().magic('matplotlib inline')
import spacy
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
from nltk.corpus import gutenberg
nlp = spacy.load('en')
from GutenbergCorpus import GutenbergCorpusReader as gcr
reader = gcr.GutenbergCorpusReader()                         ## Sussex constructor


Sussex NLTK root directory is \\ad.susx.ac.uk\ITS\TeachingResources\Departments\Informatics\LanguageEngineering\resources


## Overview
In this topic you will be using spaCy's named entity extractor and the gender classifier that you created in Topic 7 to characterise differences in the way that an author portrays male and female characters.

We will look at how it is possible to capture apspects of the way in which characters are portrayed, in terms of features. Each character in a novel will be represented in terms of a **feature set**. For example, one option is that the features are the verbs that the character is the object of (giving a rough sense of what the character does).

For each character, we will collect a set of features and represent the feature set associated with a character as a special kind of dictionary called a `Counter`. Each feature is used as a key and the counter maps that feature to a weight which could, for example, be a count indicating how many times that feature has been seen.

Given that we have a way to guess the gender of some characters, we can aggregate feature sets across all characters of a given gender. Indeed, we can aggregate male and female feature sets across all novels written by a given author or set of authors.

Once we have done this we will look at how to measure the similarity of the resulting (aggregated) feature sets.

First, however, we look at how you can gain access to the texts of a substantial collection of novels.

### Gutenberg electronic text archive
[Project Gutenberg electronic text archive](http://www.gutenberg.org/) contains around 75,000 free electronic books. We have made 14862 of the text available to you. 


To access these texts run the following cell.

If you are working on your own machine you will need to do the following:
- download and unzip the file `\\ad.susx.ac.uk\ITS\TeachingResources\Departments\Informatics\LanguageEngineering\resources.zip'`
- update the directory in the following cell, changing the string
`'\\ad.susx.ac.uk\ITS\TeachingResources\Departments\Informatics\LanguageEngineering\resources\data\gutenberg_eng'` to be the location of the directory within the resources folder that contains "authors.p" and "cleaned_meta_gutenberg"

In [2]:
reader = gcr.GutenbergCorpusReader() 

# Get a pickled dictionary of the authors in our extended Gutenberg collection
# Key = author name - string which is used to index and retrieve the works of the author
# Value = The names of the files containing the works of that author
authors = reader.get_authors()

# Let's find out how many texts we have in total.

tot = 0
for author in authors:
    tot += len(authors[author])

print("The collection contains text written by {} different authors".format(len(authors)))
print("There are a total of {} texts in the collection".format(tot))

The collection contains text written by 784 different authors
There are a total of 14862 texts in the collection


### Exercise
Run the following cell to see a list of all of the authors with texts in the collection and the number of texts for each author.

In [3]:
for author in authors:
    print("{0}: {1}".format(author,len(authors[author])))

Read, Opie Percival: 9
Duellman, William Edward: 14
Tennyson, Alfred Tennyson, Baron: 11
Hendryx, James B. (James Beardsley): 10
Adams, Samuel Hopkins: 13
Unknown: 102
Brazil, Angela: 26
Ainsworth, William Harrison: 13
Conrad, Joseph: 34
Fisher, Dorothy Canfield: 8
Braddon, M. E. (Mary Elizabeth): 28
Phillips, Rog: 7
Byron, George Gordon Byron, Baron: 9
Ingersoll, Robert Green: 31
Hurll, Estelle M. (Estelle May): 13
Carey, Rosa Nouchette: 10
Tarkington, Booth: 19
Aldrich, Thomas Bailey: 19
Caine, Hall, Sir: 18
Young, F. E. Mills (Florence Ethel Mills): 7
Raymond, Evelyn: 17
Altsheler, Joseph A. (Joseph Alexander): 33
Speed, Nell: 16
Quiller-Couch, Mabel: 9
Daudet, Alphonse: 16
Spinoza, Benedictus de: 12
Leinster, Murray: 33
Webster, Frank V.: 19
Grant, James, archaeologist: 12
Shakespeare, William: 167
Leacock, Stephen: 15
Synge, J. M. (John Millington): 8
Saltus, Edgar: 13
Burroughs, John: 22
Logan, John Alexander: 8
Ritchie, J. Ewing (James Ewing): 10
Dewey, John: 13
Thackeray, Willi

Green, John Richard: 9
Walton, Bryce: 8
Serviss, Garrett Putman: 9
Penrose, Margaret: 18
Holbach, Paul Henri Thiry, baron d': 8
Hume, Fergus: 24
Baring-Gould, S. (Sabine): 31
Sedgwick, Anne Douglas: 13
Harrison, Harry: 11
Wesley, Joseph: 7
Mitton, G. E. (Geraldine Edith): 11
Moodie, Susanna: 13
Burnham, Clara Louise: 8
Farnol, Jeffery: 14
Garland, Hamlin: 22
Russell, Bertrand: 8
Reid, Mayne: 52
Anstey, F.: 18
Mitford, Mary Russell: 13
Old Sleuth: 7
Pohl, Frederik: 8
Curtis, George William: 9
Sudermann, Hermann: 16
Ade, George: 8
Goethe, Johann Wolfgang von: 11
Roy, Lillian Elizabeth: 16
Miller, Alice Duer: 7
Stevenson, Burton Egbert: 15
Yonge, Charlotte M. (Charlotte Mary): 49
Daviess, Maria Thompson: 12
Alger, Horatio, Jr.: 70
Norton, Andre: 14
Young, Filson: 12
Shaara, Michael: 7
Mulford, Clarence Edward: 7
Hawthorne, Julian: 11
Lamb, Charles: 13
Fitzhugh, Percy Keese: 22
Harris, Joel Chandler: 12
Johnston, Mary: 11
Maupassant, Guy de: 31
Rohmer, Sax: 17
Stretton, Hesba: 7
Goodrich, 

Jewett, Sarah Orne: 12
Ewing, Juliana Horatia Gatty: 19
Wairy, Louis Constant: 13
Scott, Walter: 59
Hall, E. Raymond (Eugene Raymond): 16
Huxley, Thomas Henry: 47
Allen, James Lane: 12
Duffield, J. W.: 9
Lynde, Francis: 17
Stables, Gordon: 25
Slesar, Henry: 7
Lowell, James Russell: 10
Mill, John Stuart: 14
Hakluyt, Richard: 9
Miller, Alex. McVeigh, Mrs.: 10
Murray, Gilbert: 8
Marks, Winston K.: 12
Ballantyne, R. M. (Robert Michael): 97
Morris, Gouverneur: 7
Carleton, William: 21
Weinbaum, Stanley Grauman: 7
Barr, Amelia E.: 25
Webster, Noah: 12
Tomlinson, Everett T. (Everett Titsworth): 8
Schmitz, James H.: 11
Turgenev, Ivan Sergeevich: 20
Cory, David: 14
Crabbe, George: 8
Pinkerton, Allan: 7
Lord, John: 19
Nourse, Alan Edward: 23
Le Feuvre, Amy: 9
Optic, Oliver: 53
Norris, Frank: 10
Brereton, F. S. (Frederick Sadleir): 18
Eliot, George: 18
Casanova, Giacomo: 39
Leland, Charles Godfrey: 8
Sayce, A. H. (Archibald Henry): 8
Eastman, Charles Alexander: 8
Gordon, S. D. (Samuel Dickey): 8
M

### Obtaining the text of a novel

The Gutenberg Corpus Reader class provides a method, `get_authors_works`, that returns all of the works of a specified author.

If you run
```
works = reader.get_authors_works(<AUTHOR NAME>)
```
`works` will be a list of dictionaries where each dictionary in the list is one of the works written by the specified author.
- each dictionary in the list has three keys: 
 - "author" that maps to the name of the author
 - "title" that maps to the title of the text
 - "text" that maps to the raw text of the text
 
### Exericse

- Choose one of the authors listed when you run the cell above.
- Adapt the following cell to see the titles of the works available for your chosen author in our collection.

In [39]:
my_authors = ['Read, Opie Percival', 'Oxenham, John', 'Shea, Robert', 'Neville, Kris', 'Marquis, Don',
              'Hugo, Victor', 'Norris, Kathleen Thompson', 'Phillips, Rog', 'Pemberton, Max', 'Munro, Neil']

works = reader.get_authors_works('Read, Opie Percival')
works1 = reader.get_authors_works('Oxenham, John')
works2 = reader.get_authors_works('Shea, Robert')
works3 = reader.get_authors_works('Neville, Kris')
works4 = reader.get_authors_works('Marquis, Don')
works5 = reader.get_authors_works('Hugo, Victor')
works6 = reader.get_authors_works('Norris, Kathleen Thompson')
works7 = reader.get_authors_works('Phillips, Rog')
works8 = reader.get_authors_works('Pemberton, Max')
works9 = reader.get_authors_works('Munro, Neil')
    
for work in works8:
    print(work["title"])


The Garden of Swords
The Iron Pirate: A Plain Tale of Strange Happenings on the Sea
Aladdin of London; Or, Lodestar
The Lady Evelyn: A Story of To-day
The Great White Army
Swords Reluctant
The House Under the Sea: A Romance
The Man Who Drove the Car
White Motley
A Novel
Jewel Mysteries, from a Dealer's Note Book


### Exercise
- In the blank code cell below run spacy on the texts of one of the novels by of your chosen author.

Later you will be exploring a collection of novels, but for now, it is sufficient to work with a single novel.

In [41]:
works9 = reader.get_authors_works('Munro, Neil')
parsed_Munro_1 = nlp(works9[0]["text"])
parsed_Munro_2 = nlp(works9[1]["text"])
parsed_Munro_3 = nlp(works9[2]["text"])
parsed_Munro_4 = nlp(works9[3]["text"])
parsed_Munro_5 = nlp(works9[4]["text"])

works8 = reader.get_authors_works('Pemberton, Max')
parsed_Pemberton_1 = nlp(works8[0]["text"])
parsed_Pemberton_2 = nlp(works8[1]["text"])
parsed_Pemberton_3 = nlp(works8[2]["text"])
parsed_Pemberton_4 = nlp(works8[3]["text"])
parsed_Pemberton_5 = nlp(works8[4]["text"])

### Exercise

In the blank cell below, define a function `get_entities_in(parsed_novel,entity_type)` that takes two inputs:
- `parsed_novel` is the result of running spaCy on the raw text of some novel
- `entity_type` is one of the spaCy entity types, e.g. "PERSON"

The output should be a list of the text for each entity appearing in `parsed_novel` that is of type `entity_type`

spaCy can sometimes return entities with an empty text representation, and you don't want to include these in the output.

It is helpful to normalise the text as follows:
- convert the text for each entity to lower case using `lower()`
- remove any surrounding white space, using `strip()`

Run your function on your parsed novel and look at the first 10 characters.

In [6]:
def get_entities_in(parsed_novel, entity_type):
    return [ent.string.strip() for ent in parsed_novel.ents 
                   if ent.label_ == entity_type and ent.string.strip()]
get_entities_in(parsed_Percival, "PERSON")[:10]

['Annie Green',
 'Garrison',
 'God',
 'Cranceford',
 'John Cranceford',
 'John Cranceford',
 'Harvard',
 'Tom',
 'Louise',
 'Carl Pennington']

In [7]:
# %load solutions/get_entities_in
def get_entities_in(parsed_novel,entity_type):
    return [ent.text.strip().lower() for ent in parsed_novel.ents 
            if ent.label_ == entity_type and ent.text.strip()]

get_entities_in(parsed_Percival,"PERSON")[:10]

['annie green',
 'garrison',
 'god',
 'cranceford',
 'john cranceford',
 'john cranceford',
 'harvard',
 'tom',
 'louise',
 'carl pennington']

### Getting the main characters from a novel

Your next talks is to define a function `get_main_characters(parsed_novel,num_charachters)` that takes two inputs:
- `parsed_novel` is the result of running spaCy on the raw text of some novel
- `num_charachters` is a positive whole number, specifying how many of the main characters should be returned

The output will be a list of the `num_characters` most frequently occurring `"PERSON"` entities in `parsed_novel`.

### Exercise
In the blank cell below, implement `get_main_characters`.
- This function should make use of the `get_entities` function you have just defined
- You can use `Counter` to produce a counter from a list of elements - try `Counter(["a","b","a","c","b"])`
- Once you have a `Counter` you can use `Counter`'s `most_common` method to find the most comment characters

In [8]:
def get_main_characters(parsed_novel, num_characters):
    all_characters = get_entities_in(parsed_novel,"PERSON")
    return [person for person,count in Counter(all_characters).most_common(num_characters)]

get_main_characters(parsed_Percival, 10)

['john',
 'gid',
 'louise',
 'jim',
 'tom',
 'cranceford',
 'mayo',
 'taylor',
 'pennington',
 'margaret']

In [28]:
# %load solutions/get_main_characters

### Extracting Feature Sets for Characters

We now turn to the issue of extracting feature sets for characters or sets of characters.

As explained above, we will store each` feature sets as a `Counter`

### Exercise
- Examine the following code cell and see if you can work out what it is doing.
- Edit the code so that the novel you are working with is being used
- Run the cell and look at the output to establish if your understanding is correct.

In [11]:
def get_interesting_contexts(novels, num_characters):
    
    def of_interest(ent,main_characters):
        return (ent.text.strip().lower() in main_characters 
                and ent.label_ == 'PERSON' 
                and ent.root.head.pos_ == 'VERB')  

    contexts = defaultdict(Counter)    
    for parsed_novel in novels:
        main_characters = get_main_characters(parsed_novel,num_characters)
        for ent in parsed_novel.ents:
            if of_interest(ent,main_characters):
                contexts[ent.text.strip().lower()][ent.root.head.lemma_] += 1
    return contexts

novels = {parsed_Percival} #  use a set here to allow for the possibility of having multiple texts
number_of_characters_per_text = 10
target_contexts = get_interesting_contexts(novels,number_of_characters_per_text)
display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))


Unnamed: 0,cranceford,gid,jim,jim taylor,john,major,margaret,pennington,taylor,tom
admit,1,,,,,,,,,
agree,,,,,,1,,,,
answer,,3,1,,,8,,,1,
appear,,,,,,1,,,,
arise,,1,,,,1,,,,
ask,3,3,,,,6,,1,2,3
attempt,,,,,,,,,,1
awake,,,,,,,,1,,1
be,3,2,9,3,20,10,1,2,4,6
become,,,1,,,1,,,,


### Exercise
Make a copy of the code cell above and adapt the code so that it only counts situations where the person is the subject of the verb, i.e. in an `nsubj` relation. This identifies the things that the person does. 
 
- write your code so that it is possible to specify any set of relations of interest, e.g. both `nsubj` and `dobj`
- run versions of your code for both `nsubj` and `dobj`, the latter revealing things that are done to the person.

In [9]:
# %load solutions/verb_contexts
# %load solutions/verb_contexts

def get_interesting_contexts(novels, rels, num_characters, num_common_verbs):
    
    def of_interest(ent, rels, main_characters):
        return (ent.text.strip().lower() in main_characters 
                and ent.label_ == 'PERSON' 
                and ent.root.head.pos_ == 'VERB'
                and ent.root.dep_ in rels)
    
    def of_interest_copula_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.head.children)
    
    def of_interest_direct_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.children)
    
    def of_interest_copula_noun(ent, noun, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and noun.pos_ == 'NOUN'
                and noun in ent.root.head.children)
    
    def has_negative(token):
        for child in token.children:
            if child.dep_ == 'neg':
                return "not "
        return ""
    
    def has_advmod(token):
        adv_mod = ""
        for child in token.subtree:
            if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                adv_mod += child.orth_.lower() + " "
        return adv_mod
    
    def has_amod(token):
        a_mod = ""
        for child in token.children:
            if child.dep_ == 'amod' and child.pos_ == 'ADJ' or child.pos_ == 'ADV':
                a_mod += child.orth_.lower() + " "
        return a_mod
    
    contexts = defaultdict(Counter)
    for parsed_novel in novels:
        # get list of all verbs in a novel
        verbs = [verb.string.strip().lower() for verb in parsed_novel if verb.pos_ == 'VERB' and verb.string.strip()]
        # get most common verbs in a novel
        most_common_verbs = [verb[0] for verb in Counter(verbs).most_common(num_common_verbs)]
        main_characters = get_main_characters(parsed_novel, num_characters)
        for ent in parsed_novel.ents:
            if of_interest(ent, rels, main_characters) and not ent.root.head.lemma_ in most_common_verbs:
                contexts[ent.text.strip().lower()][has_negative(ent.root.head) + ent.root.head.lemma_] += 1
            for adj in ent.root.head.children:
                if of_interest_copula_adj(ent, adj, main_characters) and adj.dep_ == 'acomp':
                    contexts[ent.text.strip().lower()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for adj in ent.root.children:
                if of_interest_direct_adj(ent, adj, main_characters) and adj.dep_ == 'amod':
                    contexts[ent.text.strip().lower()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for noun in ent.root.children:
                if of_interest_copula_noun(ent, noun, main_characters) and noun.dep_ == 'attr':
                    contexts[ent.text.strip().lower()][has_negative(adj) + has_amod(noun) + noun.lemma_] += 1
    return contexts

novels_1 = {parsed_Percival}
number_of_characters_per_text = 15
num_common_verbs = 50
target_rels = {'nsubj', 'dobj'}
target_contexts = get_interesting_contexts(novels_1, target_rels, number_of_characters_per_text, num_common_verbs)
display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))

Unnamed: 0,cranceford,englishman,gid,gideon,jim,jim taylor,john,louise,margaret,mayo,pennington,perdue,taylor,tom,wash sanders
add,,,2,,,,,,,,,,1,,
admit,1,,,,,,,,,,,,,,
answer,,,2,,1,,,1,,,,,1,,
ask,3,1,1,2,,,,,,,,,,3,
awake,,,,,,,,,,,1,,,1,
begin,,,,,,,,2,,,,,,,
big,,,,,1,1,,,,,,,,,
break,,,,,,,,1,,,,,1,,
bring,,,,,,,,1,,1,,,,,
burst,,1,,,,,,,,,,,,,


In [None]:
parsed_Percival = nlp(works[2]["text"])
parsed_Oxenham = nlp(works1[3]["text"])
parsed_Shea = nlp(works2[1]["text"])
parsed_Neville = nlp(works3[0]["text"])
parsed_Marquis = nlp(works4[6]["text"])
parsed_Hugo = nlp(works5[0]["text"])
parsed_Norris = nlp(works6[5]["text"])
parsed_Phillips = nlp(works7[2]["text"])
parsed_Pemberton = nlp(works8[0]["text"])
parsed_Munro = nlp(works9[2]["text"])

In [12]:
def get_interesting_contexts(novels, rels, num_characters, num_common_verbs):
    
    def of_interest(ent, rels, main_characters):
        return (ent.text.strip().lower() in main_characters 
                and ent.label_ == 'PERSON' 
                and ent.root.head.pos_ == 'VERB'
                and ent.root.dep_ in rels)
    
    def of_interest_copula_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.head.children)
    
    def of_interest_direct_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.children)
    
    def of_interest_copula_noun(ent, noun, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and noun.pos_ == 'NOUN'
                and noun in ent.root.head.children)
    
    def has_negative(token):
        for child in token.children:
            if child.dep_ == 'neg':
                return "not "
        return ""
    
    def has_advmod(token):
        adv_mod = ""
        for child in token.subtree:
            if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                adv_mod += child.orth_.lower() + " "
        return adv_mod
    
    def has_amod(token):
        a_mod = ""
        for child in token.children:
            if child.dep_ == 'amod' and child.pos_ == 'ADJ' or child.pos_ == 'ADV':
                a_mod += child.orth_.lower() + " "
        return a_mod
    
    contexts = defaultdict(Counter)
    for parsed_novel in novels:
        # get list of all verbs in a novel
        verbs = [verb.string.strip().lower() for verb in parsed_novel if verb.pos_ == 'VERB' and verb.string.strip()]
        # get most common verbs in a novel
        most_common_verbs = [verb[0] for verb in Counter(verbs).most_common(num_common_verbs)]
        main_characters = get_main_characters(parsed_novel, num_characters)
        for ent in parsed_novel.ents:
            if of_interest(ent, rels, main_characters) and not ent.root.head.lemma_ in most_common_verbs and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                contexts[guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map))][has_negative(ent.root.head) + ent.root.head.lemma_] += 1
            for adj in ent.root.head.children:
                if of_interest_copula_adj(ent, adj, main_characters) and adj.dep_ == 'acomp' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map))][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for adj in ent.root.children:
                if of_interest_direct_adj(ent, adj, main_characters) and adj.dep_ == 'amod' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map))][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for noun in ent.root.children:
                if of_interest_copula_noun(ent, noun, main_characters) and noun.dep_ == 'attr' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map))][has_negative(adj) + has_amod(noun) + noun.lemma_] += 1
    return contexts

novels = {parsed_Percival, parsed_Oxenham, parsed_Shea, parsed_Neville, parsed_Marquis, parsed_Hugo, parsed_Norris, parsed_Phillips, parsed_Pemberton, parsed_Munro}
number_of_characters_per_text = 20
num_common_verbs = 15
target_rels = {'nsubj', 'dobj'}
target_contexts = get_interesting_contexts(novels, target_rels, number_of_characters_per_text, num_common_verbs)
display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))


Unnamed: 0,female,male
absent,,1
absurd,,1
accept,1,
accuse,,1
acknowledge,,1
acquiesce,,1
actually so lightly thunderstruck,,1
adamant,1,
add,13,5
address,1,2


In [11]:
def create_gender_map(dict_reader):
    names_info = defaultdict(lambda: {"gender":"", "freq": 0.0})
    for row in input_file:
        name = row["name"].lower()
        if names_info[name]["freq"] < float(row["freq"]): # is this gender more frequent?
            names_info[name]["gender"] = row["gender"] 
            names_info[name]["freq"] = float(row["freq"])
    gender_map = defaultdict(lambda: "unknown")
    for name in names_info:
        gender_map[name] = names_info[name]["gender"]
    return gender_map

input_file = csv.DictReader(open("names.csv"))
gender_map = create_gender_map(input_file)

def extend_gender_map(gender_map):
    titles = {"male": ["mr", "master", "uncle", "sir", "prince", "archduke", "grand duke", 
                       "duke", "marquis", "count", "baron", "emperor", "king", "pope", 
                       "brother", "father", "cardinal", "captain", "agent"], 
              "female": ["mrs", "ms", "miss", "maid", "madam", "aunt", "princess", 
                         "archduchess", "grand duchess", "duchess", "marchioness", "marquise", 
                         "countess", "baroness", "lady", "queen", "empress", "sister"]}    
    for gender in titles.keys():
        for count in range(len(titles[gender])):
            gender_map[titles[gender][count]] = gender
    return gender_map

def guess_gender(name, gender_map):
    tokenised_name = name.split()
    if gender_map[name] == "unknown":
        if len(tokenised_name) > 1:
            first_name = tokenised_name[0]
            last_name = tokenised_name[len(tokenised_name) - 1]
            gender_map[last_name] = gender_map[first_name]
            return gender_map[first_name]
    return gender_map[name]

def named_entity_counts(document,named_entity_label):
    occurrences = [ent.string.strip() for ent in document.ents
                   if ent.label_ == named_entity_label and ent.string.strip()]
    return Counter(occurrences)

#text = parsed_emma
#entity_type = 'PERSON'
#number_of_entities = 30
#names_with_gender = [(name, guess_gender(name.lower(), extend_gender_map(gender_map))) 
#                     for name, count in named_entity_counts(text,entity_type).most_common(number_of_entities)]
#display(pd.DataFrame(names_with_gender,columns=["Name","Gender"]))

### Exercise
Refine your solution futher by removing the most commonly occurring verbs.
Adapt a copy of the code that you have created when solving the previous exercise so that contexts involving the most  common verbs are not displayed. 

Hint: use a `Counter` to determine the count of each verb in a set of novels, and then use `most_common(n)` to find the most common n verbs.



### Exercise
Spend some time further refining your solution. Your goal shoudl be to indentify other aspects of the context where a character is mentioned that you think will help to provide a richer characterisation of the way that a character is being portrayed by the author.

### Aggregating feature sets

Once you are satisifed with the feature sets that you are able to build for a character, you are ready to undertake your analysis of the way characters are being portrayed based on gender.

- Select a set of novels
- Parse each of the novels with spaCy (this might take a while)
- Determine the settings of any parameters that are needed by the code you have written to produce the character feature sets, e.g.
 - the number characters to consider in each novel
 - the number of most common verbs to disregard
- Run your code that builds feature sets for characters over all of the novels under consideration
- Build two aggregated feature sets, one for all female characters and one for all male characters

In the next cell, we look at how to measure the difference between these two aggregated feature sets and how to assess whether the different you find is significant.

### Measuring the similarity of two feature sets

The code cell below shows how to compare the similarity of two feature sets. This is now explained.

- We are given two feature sets: `A` and `B`.
- Initially, each feature set is represented as a `Counter` which is a dictionary where the keys are the features and each feature (key) is mapped to a positive number which corresponds to the strength (weight) of that feature. 
 - feature set `A` has features `'a', 'b' and 'c'` with weights `1, 2 and 3`, respectively.
 - feature set `B` has features `'b', 'c', 'd' and 'e'` with weights `3, 4, 5 and 6`, respectively.
- Note that they share some, but not all of their features.
- Our goal is to represent both feature sets as lists in such a way that each position in a lists is consistently used for a particular feature
- For example, we could use a list with 5 positions, where the weight of feature `'a'` is held in the first position, the weight of feature `'b'` is held in the second position, and so on. 
 - with this scheme the feature list for `A` would be the list: `[1,2,3,0,0]`, and the feature list for `B` would be `[0,3,4,5,6]`.
- The function `counters_to_feature_lists` takes two feature sets each of which is a `Counter` and returns two lists, one for each of the inputs, where both lists use the same feature representation.
- In the first line of the function, the counters are added together. This is done because the keys of resulting counter (which is named `combined`) can be used to produce consistent mappings of the counters to lists - see lines 2 and 3.
- Once consistent list representations are produced for the two feature sets, we can use the `cosine_similarity` function from `sklearn` as as a measure of how similar the lists are, and therefore, how similar the feature sets are.
- `cosine_similarity` returns a real number between 0 and 1, with 1 indicating that the inputs are identical, and 0 indicating that the two inputs are completely different.


In [73]:
from sklearn.metrics.pairwise import cosine_similarity

A = Counter({'a':1, 'b':2, 'c':3})
B = Counter({'b':3, 'c':4, 'd':5, 'e':6})

def counters_to_feature_lists(counter1,counter2):
    combined = counter1 + counter2 
    list1 = [counter1[key] for key in combined]
    list2 = [counter2[key] for key in combined]
    return list1,list2

L1,L2 = counters_to_feature_lists(A,B)
print(L1)
print(L2)
cosine_similarity([L1], [L2])[0,0]

[1, 2, 3, 0, 0]
[0, 3, 4, 5, 6]


0.51875137593381149

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

female_counter = Counter(target_contexts["female"])
male_counter = Counter(target_contexts["male"])

def counters_to_feature_lists(female_counter, male_counter):
    combined = female_counter + male_counter
    female_list = [female_counter[key] for key in combined]
    male_list = [male_counter[key] for key in combined]
    return female_list, male_list

female, male = counters_to_feature_lists(female_counter, male_counter)
print(female)
print(male)
cosine_similarity([female], [male])[0,0]

[5, 3, 2, 8, 39, 2, 9, 1, 15, 4, 2, 27, 14, 15, 1, 2, 1, 7, 1, 15, 5, 1, 1, 10, 7, 2, 14, 15, 2, 1, 5, 2, 1, 1, 3, 9, 4, 8, 2, 1, 4, 2, 4, 18, 1, 3, 12, 14, 1, 4, 6, 2, 2, 291, 3, 2, 19, 19, 2, 1, 1, 52, 5, 1, 4, 1, 1, 5, 1, 8, 1, 17, 1, 1, 5, 1, 2, 6, 2, 1, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 8, 9, 2, 5, 7, 1, 1, 1, 1, 2, 1, 4, 5, 5, 2, 1, 13, 2, 2, 3, 3, 1, 1, 2, 1, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 2, 3, 1, 17, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 4, 1, 1, 1, 3, 3, 2, 3, 1, 1, 8, 1, 1, 3, 1, 1, 3, 2, 2, 5, 1, 5, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 7, 2, 1, 2, 1, 1, 5, 2, 1, 1, 6, 1, 1, 1, 3, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 9, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, 6, 1, 2, 1, 1, 4, 1, 2, 2, 2, 1, 1, 2, 1, 2, 6, 1, 4, 1, 2, 4, 1, 1, 2, 3, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 2, 7, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 2, 3

0.98341245291987445

### When is a difference a significant difference?

The male and female feature sets that you have produced will not be identical, so will have a cosine similarity of less than one.

In order to assess whether there is strong evidence that males and females are portrayed differently in the novels you have chosen, you need to compare this cosine similarity with random non-gender based splits of the characters.

In order to do this, create a random gender classifier and undertake the same analysis with this as above to produce a cosine similarity. By repeating this process several times you will get a sense of how much variation in cosine similiarity is found when doing this.

Another consideration that should be considered is that low cosine similarity values might result from a large difference in the number of male and female characters. To check this, repeat the above process,  making sure that you use feature sets from exactly the same number of male and female characters.

In [28]:
from random import randint

def random_gender_classifier():
    random_gens = {0: "female", 1: "male"}
    return random_gens[randint(0, 1)]

def get_interesting_contexts(novels, rels, num_characters, num_common_verbs):
    
    def of_interest(ent, rels, main_characters):
        return (ent.text.strip().lower() in main_characters 
                and ent.label_ == 'PERSON' 
                and ent.root.head.pos_ == 'VERB'
                and ent.root.dep_ in rels)
    
    def of_interest_copula_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.head.children)
    
    def of_interest_direct_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.children)
    
    def of_interest_copula_noun(ent, noun, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.label_ == 'PERSON'
                and ent.root.head.pos_ == 'VERB'
                and noun.pos_ == 'NOUN'
                and noun in ent.root.head.children)
    
    def has_negative(token):
        for child in token.children:
            if child.dep_ == 'neg':
                return "not "
        return ""
    
    def has_advmod(token):
        adv_mod = ""
        for child in token.subtree:
            if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                adv_mod += child.orth_.lower() + " "
        return adv_mod
    
    def has_amod(token):
        a_mod = ""
        for child in token.children:
            if child.dep_ == 'amod' and child.pos_ == 'ADJ' or child.pos_ == 'ADV':
                a_mod += child.orth_.lower() + " "
        return a_mod
    
    contexts = defaultdict(Counter)
    for parsed_novel in novels:
        # get list of all verbs in a novel
        verbs = [verb.string.strip().lower() for verb in parsed_novel if verb.pos_ == 'VERB' and verb.string.strip()]
        # get most common verbs in a novel
        most_common_verbs = [verb[0] for verb in Counter(verbs).most_common(num_common_verbs)]
        main_characters = get_main_characters(parsed_novel, num_characters)
        for ent in parsed_novel.ents:
            if of_interest(ent, rels, main_characters) and not ent.root.head.lemma_ in most_common_verbs and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                contexts[random_gender_classifier()][has_negative(ent.root.head) + ent.root.head.lemma_] += 1
            for adj in ent.root.head.children:
                if of_interest_copula_adj(ent, adj, main_characters) and adj.dep_ == 'acomp' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[random_gender_classifier()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for adj in ent.root.children:
                if of_interest_direct_adj(ent, adj, main_characters) and adj.dep_ == 'amod' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[random_gender_classifier()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for noun in ent.root.children:
                if of_interest_copula_noun(ent, noun, main_characters) and noun.dep_ == 'attr' and not guess_gender(ent.text.strip().lower(), extend_gender_map(gender_map)) == "unknown":
                    contexts[random_gender_classifier()][has_negative(adj) + has_amod(noun) + noun.lemma_] += 1
    return contexts

novels = {parsed_Percival, parsed_Oxenham, parsed_Shea, parsed_Neville, parsed_Marquis, parsed_Hugo, parsed_Norris, parsed_Phillips, parsed_Pemberton, parsed_Munro}
number_of_characters_per_text = 20
num_common_verbs = 15
target_rels = {'nsubj', 'dobj'}
target_contexts = get_interesting_contexts(novels, target_rels, number_of_characters_per_text, num_common_verbs)
display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))

Unnamed: 0,female,male
absent,1,
absurd,,1
accept,1,
accuse,1,
acknowledge,1,
acquiesce,,1
actually so lightly thunderstruck,,1
adamant,1,
add,7,11
address,2,1


### Extracting Gendered Pronouns 

Since we are interested in quantifying the extent to which authors exhibit gender-based distinctions in the way they in the way that they portray their main characters, it would be useful to base this not only on the contexts of places where a character is mentioned by name, but also when a character is mentioned with a pronoun. The pronouns "he", "she", "his" and "her" indicate the gender of the person being referred to, so provide a reliable source of additional data.

The following code cell shows how these pronouns can be extracted from a text using the `noun_chunks` property of a parsed document.

In [32]:
def gendered_pronoun(np):
    return np.text.strip() in ["he", "she", "her", "his"]

text = parsed_Percival
nounphrases = [[re.sub("\s+"," ",np.text), np.root.head.text] for np in parsed_Percival.noun_chunks if gendered_pronoun(np)]
print("There were {} noun phrases found.".format(len(nounphrases)))
display(pd.DataFrame(nounphrases))

There were 1575 noun phrases found.


Unnamed: 0,0,1
0,he,known
1,he,demanded
2,he,stand
3,he,returned
4,he,sat
5,he,said
6,he,stride
7,he,walked
8,he,served
9,she,besought


In [38]:
def get_interesting_contexts(novels, rels, num_characters, num_common_verbs):
    
    def of_interest(ent, rels, main_characters):
        return (ent.text.strip().lower() in main_characters 
                and ent.root.pos_ == 'PRON' 
                and ent.root.head.pos_ == 'VERB'
                and ent.root.dep_ in rels)
    
    def of_interest_copula_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.root.pos_ == 'PRON' 
                and ent.root.head.pos_ == 'VERB'
                and adj.pos_ == 'ADJ'
                and adj in ent.root.head.children)
    
    def of_interest_direct_adj(ent, adj, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.root.pos_ == 'PRON' 
                and adj.pos_ == 'ADJ'
                and adj in ent.root.children)
    
    def of_interest_copula_noun(ent, noun, main_characters):
        return (ent.text.strip().lower() in main_characters
                and ent.root.pos_ == 'PRON' 
                and ent.root.head.pos_ == 'VERB'
                and noun.pos_ == 'NOUN'
                and noun in ent.root.head.children)
    
    def has_negative(token):
        for child in token.children:
            if child.dep_ == 'neg':
                return "not "
        return ""
    
    def has_advmod(token):
        adv_mod = ""
        for child in token.subtree:
            if child.dep_ == 'advmod' and child.pos_ == 'ADV':
                adv_mod += child.orth_.lower() + " "
        return adv_mod
    
    def has_amod(token):
        a_mod = ""
        for child in token.children:
            if child.dep_ == 'amod' and child.pos_ == 'ADJ' or child.pos_ == 'ADV':
                a_mod += child.orth_.lower() + " "
        return a_mod
    
    contexts = defaultdict(Counter)
    for parsed_novel in novels:
        # get list of all verbs in a novel
        verbs = [verb.string.strip().lower() for verb in parsed_novel if verb.pos_ == 'VERB' and verb.string.strip()]
        # get most common verbs in a novel
        most_common_verbs = [verb[0] for verb in Counter(verbs).most_common(num_common_verbs)]
        main_characters = ["he", "she", "her", "his"]
        for ent in parsed_novel.noun_chunks:
            if of_interest(ent, rels, main_characters) and not ent.root.head.lemma_ in most_common_verbs:
                contexts[ent.root.text.strip().lower()][has_negative(ent.root.head) + ent.root.head.lemma_] += 1
            for adj in ent.root.head.children:
                if of_interest_copula_adj(ent, adj, main_characters) and adj.dep_ == 'acomp':
                    contexts[ent.root.text.strip().lower()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for adj in ent.root.children:
                if of_interest_direct_adj(ent, adj, main_characters) and adj.dep_ == 'amod':
                    contexts[ent.root.text.strip().lower()][has_negative(adj) + has_advmod(adj) + adj.lemma_] += 1
            for noun in ent.root.children:
                if of_interest_copula_noun(ent, noun, main_characters) and noun.dep_ == 'attr':
                    contexts[ent.root.text.strip().lower()][has_negative(adj) + has_amod(noun) + noun.lemma_] += 1
    return contexts

novels = {parsed_Percival, parsed_Oxenham, parsed_Shea, parsed_Neville, parsed_Marquis, parsed_Hugo, parsed_Norris, parsed_Phillips, parsed_Pemberton, parsed_Munro}
number_of_characters_per_text = 20
num_common_verbs = 15
target_rels = {'nsubj', 'dobj'}
target_contexts = get_interesting_contexts(novels, target_rels, number_of_characters_per_text, num_common_verbs)
display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))

Unnamed: 0,he,her,his,she
_,1,,,
abandon,1,,,
able,15,,,3
absolutely indiscoverable,1,,,
absolutely livid,1,,,
absolutely unconscious,,,,1
abstain,1,,,
abuse,1,,,1
accept,9,1,,4
accompany,,1,,
