In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

***Step 2: Data Cleaning***

In [2]:
# Load the raw per-line dataset; the relative path is resolved against the current working directory.
line_df = pd.read_csv('../data/csv/ShakespeareCharacterLines.csv')

In [3]:
# Preview the first rows to confirm the expected columns were read.
line_df.head()

Unnamed: 0,play,name,line,character_line_number,play_line_number
0,a-midsummer-nights-dream,THESEUS,"Now, fair Hippolyta, our nuptial hour",1,1
1,a-midsummer-nights-dream,THESEUS,Draws on apace. Four happy days bring in,2,2
2,a-midsummer-nights-dream,THESEUS,"Another moon. But, O, methinks how slow",3,3
3,a-midsummer-nights-dream,THESEUS,This old moon wanes! She lingers my desires,4,4
4,a-midsummer-nights-dream,THESEUS,Like to a stepdame or a dowager,5,5


In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
line_df.shape

(120905, 5)

- Check for improperly processed lines.

In [5]:
# Character count of each line; unusually short values point at rows worth inspecting.
line_df['line_length'] = line_df['line'].map(len)

In [18]:
# Show every row whose line text is shorter than three characters.
line_df.loc[line_df['line_length'] < 3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,line,play_line_number,line_length
play,name,character_line_number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a-midsummer-nights-dream,FLUTE,12,O!,943,2
alls-well-that-ends-well,PAROLLES,262,O!,2055,2
antony-and-cleopatra,CLEOPATRA,359,O!,2347,2
coriolanus,CORIOLANUS,21,"O,",1112,2
coriolanus,VOLUMNIA,284,us,3718,2
cymbeline,IMOGEN,1,"O,",99,2
hamlet,HAMLET,1380,us,3870,2
king-john,BASTARD,211,be,852,2
macbeth,BANQUO,6,me,144,2
much-ado-about-nothing,HERO,77,me,1245,2


In [7]:
# Sanity check: list any speaker names that contain a comma.
[speaker for speaker in line_df['name'].tolist() if ',' in speaker]

[]

In [8]:
# Sanity check: list any speaker names that contain the substring 'and'.
# The match is case-sensitive, so an upper-case 'AND' is not reported here.
[speaker for speaker in line_df['name'].tolist() if 'and' in speaker]

[]

In [9]:
# Sanity check: list any speaker names that contain a newline character.
[speaker for speaker in line_df['name'].tolist() if '\n' in speaker]

[]

- Strip leading commas and whitespace (spaces, newlines, tabs), collapse inline newlines and tabs into single spaces, and strip trailing newlines, tabs, and spaces.

In [10]:
def fix_tabs_newlines(string):
    """Normalize the whitespace of a single line of dialogue.

    Steps, in order:
      1. Collapse every run of whitespace that sits between two
         non-whitespace characters into a single space.
      2. Strip leading commas and whitespace.
      3. Strip trailing whitespace.

    Parameters
    ----------
    string : str
        Raw line text.

    Returns
    -------
    str
        The cleaned text. It is empty when the input held nothing but
        whitespace and/or leading commas.
    """
    # Raw strings keep the regex escapes (\S, \s, \A, \Z) from being read as
    # (invalid) Python string escapes.
    out = re.sub(r'(?<=\S)\s+(?=\S)', ' ', string)
    # Leading commas are removed together with the whitespace around them, so
    # no stray space is left at the start of the line.
    out = re.sub(r'\A[,\s]+', '', out)
    out = re.sub(r'\s+\Z', '', out)
    return out

In [11]:
# Run the whitespace cleaner over every line of dialogue.
line_df['line'] = line_df['line'].apply(fix_tabs_newlines)

- Scrub stage directions from dialogue.

In [12]:
# Replace each stage-direction marker with a space so the words on either side
# stay separated, then collapse the whitespace runs this creates and trim both
# ends. Without the re-normalization, a line made only of markers turns into
# several spaces and slips past the length-based empty-line check below.
line_df['line'] = (
    line_df['line']
    .str.replace('__stage_direction__', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

- Check for lines that are only stage directions, validating against the script.

In [13]:
# Recompute the character counts now that the line text has been cleaned.
line_df['line_length'] = line_df['line'].map(len)

In [14]:
# Index labels of rows whose cleaned text is zero or one character long.
empty_indices = line_df.loc[line_df['line_length'] < 2].index

- Drop empty lines.

In [15]:
# Remove the flagged rows in place; skipped when nothing was flagged.
if not empty_indices.empty:
    line_df.drop(labels=empty_indices, axis='index', inplace=True)

- Set multiindex.

In [16]:
# Index rows by play, then speaker, then that speaker's line number (in place).
line_df.set_index(
    keys=["play", "name", "character_line_number"],
    inplace=True,
)

- Export data.

In [17]:
# Write the cleaned frame to disk. The three index levels are written under the
# headers given in index_label, so the third level appears as 'line_number'
# rather than 'character_line_number'. The 'line_length' column is still part
# of the frame and is exported with it.
line_df.to_csv(
    '../data/csv/ShakespeareCharacterLines_cleaned.csv',
    index_label=['play', 'name', 'line_number'],
)

- On to Step 3 ->