In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shakespeare_functions as sf
import shakespeare_dicts as sd

from importlib import reload

import re
import nltk
from nltk import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer

from autocorrect import spell

import spacy
import en_core_web_sm

In [2]:
line_df = pd.read_csv('../data/csv/ShakespeareCharacterLines_cleaned.csv', index_col = ['play', 'name', 'line_number'])

In [3]:
colon = slice(None)

- Coriolanus is called Martius for part of the play. Make them the same character.

In [4]:
coriolanus = line_df.loc[('coriolanus', ['MARTIUS', 'CORIOLANUS'], colon), :].copy()

In [5]:
martius_line_max = coriolanus.loc[(colon, 'MARTIUS', colon), :].index.get_level_values(2).max()
coriolanus_line_max = coriolanus.loc[(colon, 'CORIOLANUS', colon), :].index.get_level_values(2).max()

In [6]:
added_index = [('coriolanus', 'CORIOLANUS', x) for x in range(coriolanus_line_max, coriolanus_line_max + martius_line_max + 1)]
coriolanus = pd.concat([coriolanus, pd.DataFrame(columns = coriolanus.columns, index = added_index)])

In [7]:
coriolanus_new_lines = coriolanus.loc[(colon, 'CORIOLANUS', colon), :].shift(martius_line_max)
coriolanus_new_lines.dropna(inplace = True)

In [8]:
martius_lines = coriolanus.loc[(colon, 'MARTIUS', colon), :].copy()
coriolanus = pd.concat([martius_lines, coriolanus_new_lines])

In [9]:
coriolanus.index = coriolanus.index.map(lambda i: (i[0], 'CORIOLANUS', i[2]))

In [10]:
coriolanus

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,line,play_line_number,line_length
play,name,line_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
coriolanus,CORIOLANUS,1,"Thanks.--What's the matter, you dissentious ro...",48,119
coriolanus,CORIOLANUS,2,He that will give good words to thee will flat...,50,1008
coriolanus,CORIOLANUS,3,Hang 'em! They say? They'll sit by th' fire an...,52,477
coriolanus,CORIOLANUS,4,They are dissolved. Hang 'em! They said they w...,54,492
coriolanus,CORIOLANUS,5,"Five tribunes to defend their vulgar wisdoms, ...",56,293
coriolanus,CORIOLANUS,...,...,...,...
coriolanus,CORIOLANUS,183,"Hear'st thou, Mars?",1086,19
coriolanus,CORIOLANUS,184,Ha?,1088,3
coriolanus,CORIOLANUS,185,"Measureless liar, thou hast made my heart Too ...",1090,363
coriolanus,CORIOLANUS,186,"Cut me to pieces, Volsces. Men and lads, Stain...",1092,231


In [11]:
line_df.drop(line_df.loc[('coriolanus', ['MARTIUS', 'CORIOLANUS'], colon), :].index, inplace = True)

In [12]:
line_df = pd.concat([line_df, coriolanus])

In [13]:
line_df.sort_index(inplace = True)

- Identify all characters that died by consulting the main texts and outside summaries for verification.

In [14]:
deaths = {'antony-and-cleopatra': ['ENOBARBUS', 'EROS', 'ANTONY', 'IRAS', 'CHARMIAN', 'CLEOPATRA'],
          'coriolanus': ['CORIOLANUS'],
          'cymbeline': ['QUEEN', 'CLOTEN'],
          'hamlet': ['POLONIUS', 'ROSENCRANTZ', 'GUILDENSTERN', 'KING', 'QUEEN', 'LAERTES', 'OPHELIA', 'HAMLET', 'OSRIC'],
          'henry-iv-part-1': ['HOTSPUR', 'VERNON', 'WORCESTER', 'BLUNT'],
          'henry-iv-part-2': ['KING'],
          'henry-v': ['HOSTESS'],
          'henry-vi-part-1': ['MORTIMER', 'BEDFORD', 'TALBOT', 'JOHN TALBOT', 'GARGRAVE', 'SALISBURY'],
          'henry-vi-part-2': ['CARDINAL', 'GLOUCESTER', 'SUFFOLK', 'CADE', 'SOMERSET', 'CLIFFORD', 'STAFFORD', 'BROTHER', 'SAYE', 'CLERK', 'HORNER', 'SOLDIER'],
          'henry-vi-part-3': ['CLIFFORD', 'FATHER', 'SON', 'WARWICK', 'MONTAGUE', 'KING HENRY', 'RUTLAND', 'YORK', 'PRINCE EDWARD'],
          'henry-viii': ['WOLSEY', 'BUCKINGHAM'],
          'julius-caesar': ['TITINIUS', 'CASSIUS', 'CAESAR', 'CINNA', 'BRUTUS', 'CATO'],
          'king-john': ['AUSTRIA', 'ARTHUR', 'KING JOHN', 'CONSTANCE', 'QUEEN ELEANOR'],
          'king-lear': ['FIRST SERVANT', 'OSWALD', 'CORDELIA', 'REGAN', 'EDMUND', 'GLOUCESTER', 'LEAR', 'CORNWALL', 'GONERIL'],
          'loves-labors-lost': ['KING'],
          'macbeth': ['DUNCAN', 'BANQUO', 'LADY MACBETH', 'YOUNG SIWARD', 'MACBETH', 'LADY MACDUFF', 'SON'],
          'othello': ['RODERIGO', 'EMILIA', 'OTHELLO', 'DESDEMONA'], 
          'pericles': ['CLEON', 'DIONYZA', 'ANTIOCHUS', 'DAUGHTER'],
          'richard-ii': ['GAUNT', 'BUSHY', 'GREEN', 'RICHARD', 'GLOUCESTER'],
          'richard-iii': ['CLARENCE', 'RICHARD', 'KING EDWARD', 'PRINCE', 'YORK', 'RIVERS', 'GREY', 'VAUGHAN', 'BRAKENBURY', 'BUCKINGHAM', 'ANNE', 'HASTINGS'],
          'romeo-and-juliet': ['ROMEO', 'JULIET', 'MERCUTIO', 'TYBALT', 'PARIS', 'LADY MONTAGUE'],
          'the-two-noble-kinsmen': ['ARCITE'],
          'the-winters-tale': ['MAMILLIUS', 'HERMIONE', 'ANTIGONUS'], 
          'timon-of-athens': ['TIMON'], 
          'titus-andronicus': ['TITUS', 'MARTIUS', 'QUINTUS', 'MUTIUS', 'TAMORA', 'SATURNINUS', 'AARON', 'BASSIANUS', 'NURSE', 'LAVINIA', 'CHIRON', 'DEMETRIUS'],
          'troilus-and-cressida': ['HECTOR', 'PATROCLUS']}

- GLOUCESTER was originally DUCHESS in richard-ii, where there are two characters with line name DUCHESS: the duchess of York and the duchess of Gloucester. She has been renamed in the text itself.

In [15]:
line_df['character_dies'] = 0

for play, name_arr in deaths.items():
    for name in name_arr:
        line_df.loc[(play, name, colon), 'character_dies'] = 1

- Get number of words per line.

In [16]:
line_df['word_count'] = line_df['line']

In [17]:
line_df['word_count'] = line_df['word_count'].map(lambda line: len(re.findall(r'(\S+)(?=(\s*|\s*\Z))', line)))

- Add new features based on total character, word, and line count.

In [18]:
line_df['total_character_count'] = 0
line_df['max_word_count'] = 0
line_df['max_line_count'] = 0

In [19]:
index_without_number = set(zip(line_df.index.get_level_values(0), line_df.index.get_level_values(1)))

In [20]:
for play, name in index_without_number:
    character_slice = line_df.loc[(play, name, colon)]
    
    line_df.loc[(play, name, colon), 'max_line_count'] = max(list(character_slice.index.get_level_values(2)))
    line_df.loc[(play, name, colon), 'max_word_count'] = max(list(character_slice['word_count']))
    line_df.loc[(play, name, colon), 'total_character_count'] = sum(list(character_slice['line_length']))

- Add percent total character, word, and line count features.

In [21]:
for play, name in index_without_number:
    character_slice = line_df.loc[(play, name, colon)]
    play_slice = line_df.loc[(play, colon, colon)]
    
    line_df.loc[(play, name, colon), 'percent_line_count'] = sum(list(character_slice.index.get_level_values(2))) / sum(list(play_slice['word_count']))
    line_df.loc[(play, name, colon), 'percent_word_count'] = sum(list(character_slice['word_count'])) / sum(list(play_slice['word_count']))

- Remove excess punctuation.

In [22]:
line_df['line'] = line_df['line'].map(sf.clean_punctuation)

- Separate all contractions, replace all poetic apostrophes with correct lettering.

In [23]:
line_df['line'] = line_df['line'].map(sf.clean_contractions)

- Remove "'s"s, they will be removed by lemmatizing or stopwords anyway.

In [24]:
line_df['line'] = line_df['line'].map(lambda x: re.sub("\'s", '', x))

- Modernize anachronistic/poetic words.

In [25]:
line_df['line'] = line_df['line'].map(sf.clean_anachronisms)

In [26]:
line_df['line'] = line_df['line'].map(lambda x: re.sub(r"-", ' ', x))

- Bring in spaCy for lemmatization, stopwords, NER, and POS tagging. 

In [27]:
nlp = en_core_web_sm.load()

- Add personalized stopwords.

In [28]:
for stop in sf.get_stopwords():
    nlp.vocab[stop].is_stop = True

- Get sentiment analysis of lines without proper nouns.

In [29]:
line_df['pos_sentiment'] = 0
line_df['neg_sentiment'] = 0
line_df['neu_sentiment'] = 0
line_df['compound_sentiment'] = 0

In [30]:
sia = SentimentIntensityAnalyzer()

for row in line_df.iterrows():
    line = row[1]['line']
    
    doc = nlp(line)
    to_sent = ""
    
    for token in doc:
        if token.pos_ != 'PROPN':
            to_sent += token.text + " "
       
    sent = sia.polarity_scores(to_sent)
    
    for key, value in sent.items():
        line_df.loc[row[0], f'{key}_sentiment'] = value

- Final tokenization: remove stopwords, add columns for words, hypernyms, and word types

In [31]:
def dict_increment(a_dict, word):
    if word in a_dict:
        a_dict[word] += 1
    else:
        a_dict[word] = 1

In [32]:
line_df.rename(columns = {'line': 'character_line'}, inplace = True)

In [33]:
%%time
character_names = set(line_df.index.get_level_values(1))

for row in line_df.iterrows():
    line = row[1]['character_line']
    
    doc = nlp(line)
    block = {}
    
    for token in doc:
        features = []
        
        if token.text.upper() in character_names:
            features.extend(["character_name", "PROPN"])
        else:
            if not token.is_stop:
                features.extend([token.lemma_.lower(), token.pos_])

                syn_text = wn.synsets(token.text.lower())
                syn_lemma = wn.synsets(token.lemma_.lower())

                #Add hypernym/synonym columns in priority order: word hypernym > lemma hypernym > word synonym > lemma synonym.
                if len(syn_text) > 0 and len(syn_text[0].hypernyms()) > 0:
                    dict_increment(block, syn_text[0].hypernyms()[0].lemma_names()[0].lower() + "_hyp")
                elif len(syn_lemma) > 0 and len(syn_lemma[0].hypernyms()) > 0:
                    dict_increment(block, syn_lemma[0].hypernyms()[0].lemma_names()[0].lower() + "_hyp")
                elif len(syn_text) > 0:
                    dict_increment(block, syn_text[0].lemma_names()[0].lower() + "_syn")
                elif len(syn_lemma) > 0:
                    dict_increment(block, syn_lemma[0].lemma_names()[0].lower() + "_syn")
                
        for item in features:
            dict_increment(block, item)

    for word, count in block.items():
        line_df.loc[row[0], word] = count

Wall time: 17min 57s


In [34]:
line_df.shape

(31831, 22281)

In [35]:
sums = line_df.drop(columns = line_df.columns[:14]).sum()

In [36]:
sums = {key: value for key, value in sums.to_dict().items() if value < 40000}

In [37]:
line_df.fillna(value = 0, inplace = True)

- Create aggregated dataframe by character for modeling.

In [38]:
to_drop = list(line_df.columns[:10])
to_drop.remove('character_dies')

In [39]:
char_df = line_df.drop(columns = to_drop).copy()

In [47]:
new_cols = [key + '_mean' for key in char_df.columns]
new_cols.extend([key + '_median' for key in char_df.columns])
new_cols.extend([key + '_std' for key in char_df.columns])

new_char_df = pd.DataFrame(index = index_without_number, columns = new_cols)

for play, name in index_without_number:
    character_slice = char_df.loc[(play, name, colon)]

    means = character_slice.mean().add_suffix('_mean')
    medians =   character_slice.median().add_suffix('_median')
    stds = character_slice.std(ddof = 0).add_suffix('_std')
    
    new_row = pd.concat([means, medians, stds], axis = 0)

    new_char_df.loc[(play, name)] = new_row
    

- Export data.

In [50]:
new_char_df.drop(columns = ['character_dies_median', 'character_dies_std'], inplace = True)

In [51]:
new_char_df.rename(columns = {'character_dies_mean': 'character_dies'}, inplace = True)

In [48]:
line_df.to_csv('../data/csv/ShakespeareCharacterLines_engineered.csv', index_label = ['play', 'name', 'line_number'])

In [52]:
new_char_df.to_csv('../data/csv/ShakespeareCharacterLines_character_corpus.csv', index_label = ['play', 'name'])

- On to Step 4 ->