- Data can be found at https://shakespeare.folger.edu/download-the-folger-shakespeare-complete-set/. Text data was selected for this project. Because of this, it is for non-commercial purposes only.

***Step 1: Data Import and Formatting***

In [1]:
import pandas as pd
import numpy as np
import re
import os

***Section 1: Process outline***

In [2]:
file = open('../data/texts/alls-well-that-ends-well_TXT_FolgerShakespeare.txt')
play = file.read()
file.close()

- Strip character descriptions and attributive text.

In [3]:
start_opener, end_opener = re.search(r'[\S\s]*(?=(ACT 1))', play).span()

In [4]:
play = play[end_opener:]

- Remove epilogue.

In [5]:
play = re.sub(r'(?<=(EPILOGUE))([\s\S]*)', '', play)

In [6]:
play = re.sub(r'EPILOGUE', '', play)

- Strip all stage directions.

In [7]:
play = re.sub(r'(\[[^]]*\])', '', play)

In [8]:
play_frame = pd.DataFrame({'play': [], 'name': [], 'line': [], 'line_number': []})

- Strip acts, scenes, and accompanying dividers.

In [9]:
re.findall(r'(ACT [0-9]+)', play)

['ACT 1', 'ACT 2', 'ACT 3', 'ACT 4', 'ACT 5']

In [10]:
play = re.sub(r'(ACT [0-9]+)', '', play)

In [11]:
play = re.sub(r'(Scene [0-9]+)', '', play)

In [12]:
play = re.sub('=+', '', play)

- Create array of all lines.

In [13]:
#All sets of characters starting with words in all capital letters, and ended by two newlines.
lines = re.findall(r'(((([A-Z]{2,}[ ,\n])+)(?!(([A-Z]{2,}[ ,\n])+)))([\S\s]+?))(?=([A-Z]{2}|\Z))',play)

In [14]:
lines = [line[0] for line in lines]

In [15]:
lines[0]

'COUNTESS  In delivering my son from me, I bury a second\nhusband.\n\n'

In [16]:
len(lines)

936

- Add all lines to data frame.

In [17]:
entries = {'play': [], 'name': [], 'line': [], 'line_number': []}
for i in range(0, len(lines)):
    line = lines[i]

    entries['play'].append('alls-well-that-ends-well')
    this_name = re.search(r'((([A-Z]{2,}[ ,\n])+)(?!(([A-Z]{2,}[ ,\n])+)))', line).group(0)
    this_name = re.sub(r'( )(?![A-Z])|[\n,]', '', this_name)
    
    entries['line'].append(re.search(r'(?<=' + this_name + ')([\s\S]*)', line).group(0))
    entries['name'].append(this_name)
    entries['line_number'].append(len([1 for name in entries['name'] if name == this_name]) + 1)
    
play_frame = pd.DataFrame(entries)

In [18]:
play_frame.head()

Unnamed: 0,play,name,line,line_number
0,alls-well-that-ends-well,COUNTESS,"In delivering my son from me, I bury a secon...",2
1,alls-well-that-ends-well,BERTRAM,"And I in going, madam, weep o'er my\nfather'...",2
2,alls-well-that-ends-well,LAFEW,"You shall find of the King a husband, madam;...",2
3,alls-well-that-ends-well,COUNTESS,What hope is there of his Majesty's\namendme...,3
4,alls-well-that-ends-well,LAFEW,"He hath abandoned his physicians, madam,\nun...",3


***Section 2: General Import***
- Generalized import of data based on previous section as example.

In [19]:
def open_file(name):
    file = open(name)
    play = file.read()
    file.close()
    return play

In [20]:
def strip_opener(play):
    start_opener, end_opener = re.search(r'[\S\s]*(?=(ACT 1))', play).span()
    return play[end_opener:]

In [21]:
def strip_epilogue(play):
    play = re.sub(r'(?<=(EPILOGUE))([\s\S]*)', '', play)
    return re.sub(r'EPILOGUE', '', play)

In [22]:
def strip_stage_directions(play):
    return re.sub(r'(\[[^]]*\])', '__stage_direction__', play)

In [23]:
def strip_acts(play):
    return re.sub(r'(ACT [0-9]+)', '', play)

In [24]:
def strip_scenes(play):
    return re.sub(r'(Scene [0-9]+)', '', play)

In [25]:
def strip_dividers(play):
    return re.sub('=+', '', play)

In [26]:
def scrub_name(name):
    #scrub 'and's
    name = re.sub(r'[a-z]', '', name)
    #scrub trailing and preceding punctuation and whitespace
    name = re.sub(r'(?<=\A)([\s ,]+)(?=[A-Z])', '', name)
    name = re.sub(r'(?<=[A-Z])([\s ,]+)(?=\Z)', '', name)
    return name

In [27]:
def append_play_lines(entries, play, play_name):
    #All character names in the file are in all capitals and are at least 2 letters long. This finds all contiguous name separated by ',', ' ', and 'and' to get the name(s).
    #All subsequent characters are the charater line, ending at play end or at the next name.
    lines = re.findall(r'(((([A-Z]{2,}(,*\s+and\s+|[,\.]{0,1}\s+|\/))+)(?!(([A-Z]{2,}(,*\s+and\s+|[,\.]{0,1}\s+|\/))+)))([\S\s]+?))(?=([A-Z]{2}|\Z))',play)
    lines = [line[0] for line in lines]

    for i in range(0, len(lines)):
        line = lines[i]

        this_name = re.search(r'((([A-Z]{2,}(,*\s+and\s+|[,\.]{0,1}\s+|\/))+)(?!(([A-Z]{2,}(,*\s+and\s+|[,\.]{0,1}\s+|\/))+)))', line).group(0)
        
        #Add multiple lines if multiple characters speaking
        multilines = []
        if len(this_name.split('/')) > 1:
            multilines.extend(this_name.split('/')) 
        if len(this_name.split(',')) > 1:
            multilines.extend(this_name.split(',')) 
        if len(this_name.split('and')) > 1:
            multilines.extend(this_name.split('and')) 
        if len(multilines) == 0:
            multilines.append(this_name)
            
        multilines = [scrub_name(name) for name in multilines]
        
        for name in set(multilines):
            
            if(len(re.sub('\s*', '', name)) > 0):
                #increment character line number for this specific play
                count = 1
                for i in range(0, len(entries['name'])):
                    if entries['name'][i] == name and entries['play'][i] == play_name:
                        count += 1
                entries['character_line_number'].append(count)
                
                entries['play_line_number'].append(len([1 for x in entries['play'] if x == play_name]) + 1)
                entries['name'].append(name)
                entries['play'].append(play_name)
                entries['line'].append(re.search(r'(?<=' + this_name + ')([\s\S]*)', line).group(0))
        
    return entries

In [28]:
entries = {'play': [], 'name': [], 'line': [], 'character_line_number': [], 'play_line_number': []}

rel_dir = '../data/texts'
play_files = os.listdir(rel_dir)

for play_file in play_files:
    if '.txt' in play_file:
        play = open_file(f'{rel_dir}/{play_file}')
        play = strip_opener(play)
        play = strip_epilogue(play)
        play = strip_stage_directions(play)
        play = strip_acts(play)
        play = strip_scenes(play)
        play = strip_dividers(play)
        
        play_name = re.search(r'[\S\s]*(?=(_TXT))', play_file).group(0)
        entries = append_play_lines(entries, play, play_name)

In [29]:
line_df = pd.DataFrame(entries)

In [30]:
line_df.shape

(31834, 5)

- Save data.

In [31]:
line_df.to_csv('../data/csv/ShakespeareCharacterLines.csv', index = False)

- To step 2 ->