In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

***Step 2: Data Cleaning***

In [2]:
# Load the raw character-line CSV into a DataFrame.
line_df = pd.read_csv(
    filepath_or_buffer='../data/csv/ShakespeareCharacterLines.csv',
)

In [3]:
# Preview the first five rows.
line_df.head(n=5)

Unnamed: 0,play,name,line,character_line_number,play_line_number
0,a-midsummer-nights-dream,THESEUS,"Now, fair Hippolyta, our nuptial hour\nDraws o...",1,1
1,a-midsummer-nights-dream,HIPPOLYTA,Four days will quickly steep themselves in nig...,1,2
2,a-midsummer-nights-dream,THESEUS,"Go, Philostrate,\nStir up the Athenian youth t...",2,3
3,a-midsummer-nights-dream,EGEUS,"Happy be Theseus, our renowned duke!\n\n",1,4
4,a-midsummer-nights-dream,THESEUS,"Thanks, good Egeus. What's the news with thee?...",3,5


In [4]:
# (row count, column count) of the loaded table.
line_df.shape

(31831, 5)

- Check for improperly processed lines.

In [5]:
# Character count of each raw line, used below to surface very short entries.
line_df['line_length'] = line_df['line'].map(len)

In [6]:
# Display the rows whose line is shorter than five characters.
line_df.loc[line_df['line_length'].lt(5)]

Unnamed: 0,play,name,line,character_line_number,play_line_number,line_length
1099,alls-well-that-ends-well,PAROLLES,O!\n\n,98,595,4
2210,antony-and-cleopatra,CLEOPATRA,O!\n\n,117,770,4
24944,the-taming-of-the-shrew,PETER,I.\n\n,1,463,4


In [7]:
# Speaker names that contain a comma.
[speaker for speaker in line_df['name'].tolist() if ',' in speaker]

[]

In [8]:
# Speaker names that contain the substring 'and'.
# The match is case-sensitive, so an upper-case 'AND' is not detected.
[speaker for speaker in line_df['name'].tolist() if 'and' in speaker]

[]

In [9]:
# Speaker names that contain a newline character.
[speaker for speaker in line_df['name'].tolist() if '\n' in speaker]

[]

- Strip leading commas and leading whitespace (spaces, newlines, tabs), collapse inline runs of whitespace (including newlines and tabs) into a single space, and strip trailing whitespace.

In [10]:
def fix_tabs_newlines(string):
    """Normalize the whitespace of a single line of dialogue.

    Steps, applied in order:

    1. Collapse every run of whitespace (spaces, tabs, newlines) that sits
       between two non-whitespace characters into a single space.
    2. Strip leading whitespace.
    3. Strip trailing whitespace.
    4. Remove a leading comma together with the whitespace that follows it.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    # Raw string literals keep regex escapes such as \s and \A from being
    # interpreted as (invalid) Python string escapes.
    # Turn multiple whitespace between two characters into a single space.
    out = re.sub(r'(?<=\S)\s+(?=\S)', ' ', string)
    # Remove leading whitespace.
    out = re.sub(r'\A\s+', '', out)
    # Remove trailing whitespace.
    out = re.sub(r'\s+\Z', '', out)
    # Remove a leading ',' plus the whitespace after it. The replacement is
    # the empty string: substituting a space here would put back the leading
    # whitespace that was just stripped.
    out = re.sub(r'\A,\s+(?=\S)', '', out)
    return out

In [11]:
# Run the whitespace normalizer over every line of dialogue.
line_df['line'] = line_df['line'].apply(fix_tabs_newlines)

- Scrub stage directions from dialogue.

In [12]:
# Blank out the stage-direction placeholder, then normalize whitespace again:
# substituting a space into already-cleaned text can leave doubled, leading,
# or trailing spaces behind. A line that held nothing but the placeholder
# ends up as an empty string.
line_df['line'] = (
    line_df['line']
    .map(lambda text: text.replace('__stage_direction__', ' '))
    .map(fix_tabs_newlines)
)

- Check for lines that are only stage directions, validating against the script.

In [13]:
# Recompute the character count of each line from the current 'line' column.
line_df['line_length'] = line_df['line'].map(len)

In [14]:
# Index labels of the rows whose line is shorter than two characters.
empty_indices = line_df.loc[line_df['line_length'].lt(2)].index

- Drop the empty lines.

In [15]:
# Remove the flagged rows in place; skip the call when nothing was flagged.
if not empty_indices.empty:
    line_df.drop(index=empty_indices, inplace=True)

- Set multiindex.

In [16]:
# Replace the index, in place, with a three-level MultiIndex built from the
# play, name, and character_line_number columns.
line_df.set_index(
    keys=["play", "name", "character_line_number"],
    inplace=True,
)

- Export data.

In [17]:
# Write the cleaned table to CSV. The index columns are written under the
# headers play / name / line_number.
line_df.to_csv(
    path_or_buf='../data/csv/ShakespeareCharacterLines_cleaned.csv',
    index_label=['play', 'name', 'line_number'],
)

- On to Step 3 ->