In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shakespeare_functions as sf
import shakespeare_dicts as sd

import re
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer

from contractions import contractions_dict
from autocorrect import spell

#Middle English Imports
from cltk.tokenizers.enm import MiddleEnglishWordTokenizer
from cltk.stem.enm import stem
from cltk.tokenizers.enm import MiddleEnglishWordTokenizer

In [2]:
line_df = pd.read_csv('../data/csv/ShakespeareCharacterLines_cleaned.csv', index_col = ['play', 'name', 'line_number'])

In [3]:
colon = slice(None)

- Coriolanus is called Martius for part of the play. Make them the same character.

In [4]:
coriolanus = line_df.loc[('coriolanus', ['MARTIUS', 'CORIOLANUS'], colon), :].copy()

In [5]:
martius_line_max = coriolanus.loc[(colon, 'MARTIUS', colon), :].index.get_level_values(2).max()
coriolanus_line_max = coriolanus.loc[(colon, 'CORIOLANUS', colon), :].index.get_level_values(2).max()

In [6]:
added_index = [('coriolanus', 'CORIOLANUS', x) for x in range(coriolanus_line_max, coriolanus_line_max + martius_line_max + 1)]
coriolanus = pd.concat([coriolanus, pd.DataFrame(columns = coriolanus.columns, index = added_index)])

In [7]:
coriolanus_new_lines = coriolanus.loc[(colon, 'CORIOLANUS', colon), :].shift(martius_line_max)
coriolanus_new_lines.dropna(inplace = True)

In [8]:
martius_lines = coriolanus.loc[(colon, 'MARTIUS', colon), :].copy()
coriolanus = pd.concat([martius_lines, coriolanus_new_lines])

In [9]:
coriolanus.index = coriolanus.index.map(lambda i: (i[0], 'CORIOLANUS', i[2]))

In [10]:
coriolanus

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,line,play_line_number,line_length
play,name,line_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
coriolanus,CORIOLANUS,1,"Thanks.--What's the matter, you dissentious ro...",48,119
coriolanus,CORIOLANUS,2,He that will give good words to thee will flat...,50,1008
coriolanus,CORIOLANUS,3,Hang 'em! They say? They'll sit by th' fire an...,52,477
coriolanus,CORIOLANUS,4,They are dissolved. Hang 'em! They said they w...,54,492
coriolanus,CORIOLANUS,5,"Five tribunes to defend their vulgar wisdoms, ...",56,293
coriolanus,CORIOLANUS,...,...,...,...
coriolanus,CORIOLANUS,183,"Hear'st thou, Mars?",1086,19
coriolanus,CORIOLANUS,184,Ha?,1088,3
coriolanus,CORIOLANUS,185,"Measureless liar, thou hast made my heart Too ...",1090,363
coriolanus,CORIOLANUS,186,"Cut me to pieces, Volsces. Men and lads, Stain...",1092,231


In [11]:
line_df.drop(line_df.loc[('coriolanus', ['MARTIUS', 'CORIOLANUS'], colon), :].index, inplace = True)

In [12]:
line_df = pd.concat([line_df, coriolanus])

In [13]:
line_df.sort_index(inplace = True)

- Identify all characters that died by consulting the main texts and outside summaries for verification.

In [14]:
deaths = {'antony-and-cleopatra': ['ENOBARBUS', 'EROS', 'ANTONY', 'IRAS', 'CHARMIAN', 'CLEOPATRA'],
          'coriolanus': ['CORIOLANUS'],
          'cymbeline': ['QUEEN', 'CLOTEN'],
          'hamlet': ['POLONIUS', 'ROSENCRANTZ', 'GUILDENSTERN', 'KING', 'QUEEN', 'LAERTES', 'OPHELIA', 'HAMLET', 'OSRIC'],
          'henry-iv-part-1': ['HOTSPUR', 'VERNON', 'WORCESTER', 'BLUNT'],
          'henry-iv-part-2': ['KING'],
          'henry-v': ['HOSTESS'],
          'henry-vi-part-1': ['MORTIMER', 'BEDFORD', 'TALBOT', 'JOHN TALBOT', 'GARGRAVE', 'SALISBURY'],
          'henry-vi-part-2': ['CARDINAL', 'GLOUCESTER', 'SUFFOLK', 'CADE', 'SOMERSET', 'CLIFFORD', 'STAFFORD', 'BROTHER', 'SAYE', 'CLERK', 'HORNER', 'SOLDIER'],
          'henry-vi-part-3': ['CLIFFORD', 'FATHER', 'SON', 'WARWICK', 'MONTAGUE', 'KING HENRY', 'RUTLAND', 'YORK', 'PRINCE EDWARD'],
          'henry-viii': ['WOLSEY', 'BUCKINGHAM'],
          'julius-caesar': ['TITINIUS', 'CASSIUS', 'CAESAR', 'CINNA', 'BRUTUS', 'CATO'],
          'king-john': ['AUSTRIA', 'ARTHUR', 'KING JOHN', 'CONSTANCE', 'QUEEN ELEANOR'],
          'king-lear': ['FIRST SERVANT', 'OSWALD', 'CORDELIA', 'REGAN', 'EDMUND', 'GLOUCESTER', 'LEAR', 'CORNWALL', 'GONERIL'],
          'loves-labors-lost': ['KING'],
          'macbeth': ['DUNCAN', 'BANQUO', 'LADY MACBETH', 'YOUNG SIWARD', 'MACBETH', 'LADY MACDUFF', 'SON'],
          'othello': ['RODERIGO', 'EMILIA', 'OTHELLO', 'DESDEMONA'], 
          'pericles': ['CLEON', 'DIONYZA', 'ANTIOCHUS', 'DAUGHTER'],
          'richard-ii': ['GAUNT', 'BUSHY', 'GREEN', 'RICHARD', 'GLOUCESTER'],
          'richard-iii': ['CLARENCE', 'RICHARD', 'KING EDWARD', 'PRINCE', 'YORK', 'RIVERS', 'GREY', 'VAUGHAN', 'BRAKENBURY', 'BUCKINGHAM', 'ANNE', 'HASTINGS'],
          'romeo-and-juliet': ['ROMEO', 'JULIET', 'MERCUTIO', 'TYBALT', 'PARIS', 'LADY MONTAGUE'],
          'the-two-noble-kinsmen': ['ARCITE'],
          'the-winters-tale': ['MAMILLIUS', 'HERMIONE', 'ANTIGONUS'], 
          'timon-of-athens': ['TIMON'], 
          'titus-andronicus': ['TITUS', 'MARTIUS', 'QUINTUS', 'MUTIUS', 'TAMORA', 'SATURNINUS', 'AARON', 'BASSIANUS', 'NURSE', 'LAVINIA', 'CHIRON', 'DEMETRIUS'],
          'troilus-and-cressida': ['HECTOR', 'PATROCLUS']}

- GLOUCESTER was originally DUCHESS in richard-ii, where there are two characters with line name DUCHESS: the duchess of York and the duchess of Gloucester. She has been renamed in the text itself.

In [15]:
line_df['character_dies'] = 0

for play, name_arr in deaths.items():
    for name in name_arr:
        line_df.loc[(play, name, colon), 'character_dies'] = 1

- Get number of words per line.

In [16]:
line_df['word_count'] = line_df['line']

In [17]:
line_df['word_count'] = line_df['word_count'].map(lambda line: len(re.findall(r'(\S+)(?=(\s*|\s*\Z))', line)))

- Add new features based on total character, word, and line count.

In [18]:
line_df['total_character_count'] = 0
line_df['max_word_count'] = 0
line_df['max_line_count'] = 0

In [19]:
for index in line_df.index:
    play  = index[0]
    name = index[1]
    
    character_slice = line_df.loc[(play, name, colon)]
    
    line_df.loc[(play, name, colon), 'max_line_count'] = max(list(character_slice.index.get_level_values(2)))
    line_df.loc[(play, name, colon), 'max_word_count'] = max(list(character_slice['word_count']))
    line_df.loc[(play, name, colon), 'total_character_count'] = sum(list(character_slice['line_length']))

- Add percent total character, word, and line count features.

In [20]:
for index in line_df.index:
    play  = index[0]
    name = index[1]
    
    character_slice = line_df.loc[(play, name, colon)]
    play_slice = line_df.loc[(play, colon, colon)]
    
    line_df.loc[(play, name, colon), 'percent_line_count'] = sum(list(character_slice.index.get_level_values(2))) / sum(list(play_slice['word_count']))
    line_df.loc[(play, name, colon), 'percent_word_count'] = sum(list(character_slice['word_count'])) / sum(list(play_slice['word_count']))

- Separate all contractions, replace all poetic apostrophes with correct lettering.

In [21]:
contractions = contractions_dict.copy()
contractions = {re.sub(r'’', "'", key): re.sub(r'’', "'", value) for key, value in contractions.items() if '.' not in key}
contractions = {key:value for key, value in contractions.items() if "'" in key}

In [48]:
shakespeare_contractions_dict = sd.shakespeare_contractions_dict()

In [23]:
for key, value in shakespeare_contractions_dict.items():
        contractions[key] = value

In [24]:
def clean_contractions(sentence):
    sentence = sf.clean_punctuation(sentence)
    for contraction, conversion in contractions.items():
        sentence = re.sub(contraction, conversion, sentence)
        sentence = re.sub(contraction.capitalize(), conversion, sentence)
        sentence = re.sub(contraction.lower(), conversion, sentence)
        
    return sentence

In [25]:
line_df['line'] = line_df['line'].map(clean_contractions)

- Remove "'s"s, they will be removed by lemmatizing or stopwords anyway.

In [26]:
line_df['line'] = line_df['line'].map(lambda x: re.sub("\'s", '', x))

- Modernize anachronistic/poetic words.

In [27]:
line_df['line'] = line_df['line'].map(sf.clean_anachronisms)

In [28]:
corpus = sf.corpusize(line_df, 'line')

- stopword, NER, lemmatize. Use wordnet to identify words to clean.

In [29]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('\w+')

unnymable = {}

for line in line_df['line'].values:
    for word in tokenizer.tokenize(line):
        synonyms = wn.synsets(word)

        unnymable[word] = 2
        
        if len(synonyms) > 0:
            hypernyms = synonyms[0].hypernyms()
            unnymable[word] -= 1
            if len(hypernyms) > 0:
                new_word = hypernyms[0].lemma_names()[0]
                unnymable[word] -= 1

In [30]:
unnymable = {key: value for key, value in unnymable.items() if value > 0}

In [33]:
unnymable = {k: v for k, v in sorted(unnymable.items(), key=lambda item: item[1], reverse = True)}

In [46]:
{key: value for key, value in unnymable.items() if re.search('th\Z', key) != None}

{'with': 2,
 'stayeth': 2,
 'constraineth': 2,
 'hateth': 2,
 'knitteth': 2,
 'witnesseth': 2,
 'presenteth': 2,
 'mistaketh': 2,
 'charmeth': 2,
 'quoth': 2,
 'bajazeth': 2,
 'nourisheth': 2,
 'presageth': 2,
 'pleaseth': 2,
 'importeth': 2,
 'purposeth': 2,
 'perceiveth': 2,
 'teacheth': 2,
 'pierceth': 2,
 'provoketh': 2,
 'feedeth': 2,
 'sdeath': 2,
 'placeth': 2,
 'sufficeth': 2,
 'ofergrowth': 2,
 'appeareth': 2,
 'sith': 2,
 'returneth': 2,
 'turneth': 2,
 'singeth': 2,
 'confesseth': 2,
 'deceiveth': 2,
 'liveth': 2,
 'wisheth': 2,
 'doteth': 2,
 'monmouth': 2,
 'menteith': 2,
 'bridgenorth': 2,
 'promiseth': 2,
 'loseth': 2,
 'exceedeth': 2,
 'useth': 2,
 'buildeth': 2,
 'illumineth': 2,
 'bleedeth': 2,
 'standeth': 2,
 'amurath': 2,
 'saith': 2,
 'lurketh': 2,
 'blackheath': 2,
 'feith': 2,
 'delabreth': 2,
 'sendeth': 2,
 'cometh': 2,
 'wherewith': 2,
 'killeth': 2,
 'sueth': 2,
 'approacheth': 2,
 'serveth': 2,
 'perisheth': 2,
 'wanteth': 2,
 'giveth': 2,
 'ceaseth': 2,
 '

- Stem and lemmatize corpus.