In [1]:
import pickle
import pandas as pd
import re

### Cleaning the EXU data

A Critical Role mini-series hosted by Aabria

In [2]:
exu_df = pd.read_pickle('../pickle_jar/exu.pkl')
exu_df.head()

Unnamed: 0,name,text,ts.h,ts.m,ts.s,episode,title
0,AABRIA,Let's start with Robbie.,0,0,0,1,The Nameless Ones
1,ROBBIE,I'm so excited!,0,0,18,1,The Nameless Ones
2,AABRIA,Yeah!,0,0,24,1,The Nameless Ones
3,ROBBIE,Whew!,0,0,26,1,The Nameless Ones
4,AABRIA,Liam!,0,0,26,1,The Nameless Ones


In [3]:
exu_df = exu_df.rename(columns={'ts.h': 'ts_h', 'ts.m': 'ts_m', 'ts.s': 'ts_s'})

In [4]:
exu_df["ts_h"] = exu_df["ts_h"].astype(str)
exu_df["ts_h"] = exu_df["ts_h"].str.zfill(2)
exu_df["ts_m"] = exu_df["ts_m"].astype(str)
exu_df["ts_m"] = exu_df["ts_m"].str.zfill(2)
exu_df["ts_s"] = exu_df["ts_s"].astype(str)
exu_df["ts_s"] = exu_df["ts_s"].str.zfill(2)

In [5]:
exu_df['timestamp'] = exu_df[['ts_h', 'ts_m', 'ts_s']].agg(':'.join, axis=1)

In [6]:
exu_df = exu_df.drop(columns=['ts_h', 'ts_m', 'ts_s', 'title'])
exu_df.head()

Unnamed: 0,name,text,episode,timestamp
0,AABRIA,Let's start with Robbie.,1,00:00:00
1,ROBBIE,I'm so excited!,1,00:00:18
2,AABRIA,Yeah!,1,00:00:24
3,ROBBIE,Whew!,1,00:00:26
4,AABRIA,Liam!,1,00:00:26


Let's check if the EXU data has some of the same/familiar features that we saw while making cr_df. I'm going to be reusing as much of the regex from the cr_df processing as I possibly can.

In [7]:
exu_df[exu_df['text'].str.contains(r'♪')]

Unnamed: 0,name,text,episode,timestamp
1353,AIMEE,Yeah. She's just like-- ♪ Let the sun shine ♪ ...,1,02:15:58
2304,AIMEE,"♪ We got our minis, we got our minis ♪",1,03:11:18
2825,AABRIA,♪ Everybody! ♪,1,03:41:10
3112,LIAM,"It's (exhales) I thought the words, ""It's been...",1,03:57:38
3113,ALL,♪ It's been a while! ♪ (laughter),1,03:57:46
...,...,...,...,...
22079,ROBBIE,"♪ Ooh, come on ♪ ♪ Come on, Bless ♪ ♪ Please d...",8,01:18:24
22356,MATT,♪ I feel your pain ♪,8,01:37:06
22818,MATT,♪ I'm a liability ♪,8,02:27:40
22856,ASHLEY,♪ Private Eyes ♪,8,02:30:49


In [8]:
exu_df['text'] = exu_df['text'].str.replace(r'♪', r'', regex=True).astype('str')
exu_df.text[23184]

" It's been a while  I think I have some. "

In [9]:
exu_df[exu_df['text'].str.contains(r'.*\[inaudible\].*')]

Unnamed: 0,name,text,episode,timestamp
2876,AABRIA,I'm [inaudible] going to let you stick the lan...,1,03:43:43
4131,MATT,"It's [inaudible] great, but it feels different...",2,00:59:29
16419,MATT,The [inaudible] is strong.,6,02:50:56
17060,LIAM,They would've come after me on my [inaudible].,6,03:35:24
18561,MATT,"They go hard here in [inaudible], man!",7,01:07:56
19781,ASHLEY,[inaudible] d6 for this one.,7,02:54:43
20023,AABRIA,"No, [inaudible] you were getting at YMCA.",7,03:08:29


In [10]:
inaudible_df = pd.DataFrame(exu_df[exu_df['text'].str.contains(r'.*\[inaudible\].*')])

In [11]:
inaudible_df['text'] = inaudible_df['text'].str.replace(r'.*(\[inaudible\]).*', r'\1', regex=True).astype('str')
inaudible_df

Unnamed: 0,name,text,episode,timestamp
2876,AABRIA,[inaudible],1,03:43:43
4131,MATT,[inaudible],2,00:59:29
16419,MATT,[inaudible],6,02:50:56
17060,LIAM,[inaudible],6,03:35:24
18561,MATT,[inaudible],7,01:07:56
19781,ASHLEY,[inaudible],7,02:54:43
20023,AABRIA,[inaudible],7,03:08:29


In [12]:
inaudible = pd.Series(inaudible_df['text'])

In [13]:
exu_df['inaudible_speech'] = inaudible

In [14]:
exu_df[exu_df['text'].str.contains(r'\[.*\]')]

Unnamed: 0,name,text,episode,timestamp,inaudible_speech
2876,AABRIA,I'm [inaudible] going to let you stick the lan...,1,03:43:43,[inaudible]
4131,MATT,"It's [inaudible] great, but it feels different...",2,00:59:29,[inaudible]
10697,SAM,"Crew, what do you think? [cheerful jazzy music]",4,02:00:22,
10735,SAM,"Until then, be sure to hit that subscribe butt...",4,02:04:03,
16419,MATT,The [inaudible] is strong.,6,02:50:56,[inaudible]
17060,LIAM,They would've come after me on my [inaudible].,6,03:35:24,[inaudible]
18561,MATT,"They go hard here in [inaudible], man!",7,01:07:56,[inaudible]
19781,ASHLEY,[inaudible] d6 for this one.,7,02:54:43,[inaudible]
19915,MATT,(Thurston impression) [Inaudible] impact.,7,03:02:23,
20023,AABRIA,"No, [inaudible] you were getting at YMCA.",7,03:08:29,[inaudible]


In [15]:
exu_df['text'] = exu_df['text'].str.replace(r'\[.*\]', r'', regex=True).astype('str')

In [16]:
exu_df['text'] = exu_df['text'].str.replace(r'\(whispered( .*)', r'(whispered)\1', regex=True).astype('str')
exu_df.text[13830]

'(whispered) Oh god. '

In [17]:
exu_df[exu_df['text'].str.contains(r'\(')]

Unnamed: 0,name,text,episode,timestamp,inaudible_speech
6,ASHLEY,(laughter),1,00:00:37,
14,AABRIA,(laughter),1,00:01:06,
17,AIMEE,(cheering),1,00:01:14,
35,ASHLEY,"(laughter) You know, hemlock and oleander and ...",1,00:13:31,
41,ASHLEY,(gasps) Oh.,1,00:15:19,
...,...,...,...,...,...
23342,LIAM,(sighs),8,03:24:00,
23391,ROBBIE,(laughter),8,03:29:17,
23392,LIAM,"and the whole time, he's also just (sighs), wa...",8,03:29:19,
23395,AABRIA,Okay. (sighs) I didn't kill any of you and I w...,8,03:29:43,


In [18]:
nonspeech_df = pd.DataFrame(exu_df[exu_df['text'].str.contains('^\(.*\) $')])

In [19]:
nonspeech = pd.Series(nonspeech_df['text'])
nonspeech

6                         (laughter) 
14                        (laughter) 
17                        (cheering) 
44       (monkey screech) (laughter) 
70                        (laughter) 
                     ...             
23293                     (laughter) 
23297                     (laughter) 
23300               (happy exhaling) 
23342                        (sighs) 
23391                     (laughter) 
Name: text, Length: 772, dtype: object

In [20]:
exu_df['text'] = exu_df['text'].str.replace(r'^\(.*\) $', r'', regex=True).astype('str')

In [21]:
four_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains('^.*\(.*\).*\(.*\).* \(.*\).* \(.*\),*?.*$')])

In [22]:
four_bracks['text'] = four_bracks['text'].str.replace(r'^.*(\(.*\)).*(\(.*\)).* (\(.*\)).* (\(.*\)),*?.*$', r'\1 \2 \3 \4', regex=True).astype('str')
four_bracks

Unnamed: 0,name,text,episode,timestamp,inaudible_speech
8043,BEAU,(Shakäste shushes) (demonic laughing) (gruntin...,3,02:07:47,
18108,ESO,"(playing ""Hot Cross Buns"" off-key) (playing ""H...",7,00:02:13,


In [23]:
four = pd.Series(four_bracks['text'])
four

8043     (Shakäste shushes) (demonic laughing) (gruntin...
18108    (playing "Hot Cross Buns" off-key) (playing "H...
Name: text, dtype: object

In [24]:
exu_df['text'] = exu_df['text'].str.replace(r'(^.*)\(.*\)(.*)\(.*\)(.* )\(.*\)(.* )\(.*\)(,*?.*$)', r'\1 \2 \3 \4 \5', regex=True).astype('str')

In [25]:
three_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains('^.*\(.*\).* \(.*\).* "*\(.*\)"*,*?.*$')])

In [26]:
three_bracks['text'] = three_bracks['text'].str.replace(r'^.*(\(.*\)).* (\(.*\)).* "*(\(.*\))"*,*?.*$', r'\1 \2 \3', regex=True).astype('str')

In [27]:
three = pd.Series(three_bracks['text'])
three

97       (high-pitched) (clears throat) (normal voice)
16916                      (laughs) (barks) (laughter)
21692                   (yelling) (laughter) (yelling)
22286              (growls) (eating noises) (laughter)
23312              (hands thump) (laughter) (laughter)
Name: text, dtype: object

In [28]:
exu_df['text'] = exu_df['text'].str.replace(r'(^.*)\(.*\)(.* )\(.*\)(.* "*)\(.*\)("*,*?.*$)', r'\1 \2 \3 \4', regex=True).astype('str')

In [29]:
mult_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains('^.*\(.*\).*\(.*\),?.*$')])

In [30]:
mult_bracks['text'] = mult_bracks['text'].str.replace(r'^.*(\(.*\)).*(\(.*\)),?.*$', r'\1 \2', regex=True).astype('str')
mult_bracks

Unnamed: 0,name,text,episode,timestamp,inaudible_speech
131,AIMEE,(gasps) (loud whisper),1,00:28:25,
535,AIMEE,(yelling) (laughter),1,00:56:08,
1113,AABRIA,(monkey scream) (laughter),1,01:56:56,
1327,ASHLEY,(Cockney accent) (laughter),1,02:13:43,
1411,ASHLEY,(Cockney accent) (laughter),1,02:19:18,
1572,ROBBIE,(groans) (unhappily),1,02:28:19,
2869,AABRIA,(cheering) (cheering),1,03:43:24,
3077,ROBBIE,(clears throat) (queasy sound),1,03:55:02,
4246,AIMEE,(blows raspberry) (loud thump),2,01:06:26,
5027,MATT,(low-pitched scream) (laughter),2,02:29:39,


In [31]:
mult = pd.Series(mult_bracks['text'])
mult

131                (gasps) (loud whisper)
535                  (yelling) (laughter)
1113           (monkey scream) (laughter)
1327          (Cockney accent) (laughter)
1411          (Cockney accent) (laughter)
1572                 (groans) (unhappily)
2869                (cheering) (cheering)
3077       (clears throat) (queasy sound)
4246       (blows raspberry) (loud thump)
5027      (low-pitched scream) (laughter)
6757                  (laughs) (laughter)
10317               (cheering) (cheering)
10834                  (laughter) (sighs)
11874        (squelching fart) (laughter)
13100           (laughs) (creepy chuckle)
13430               (laughter) (laughter)
14102          (fire whooshing) (screams)
14786             (cheering) (soft music)
15268             (knocking) (whispering)
17259                    (sighs) (groans)
18230                    (snorts) (sighs)
18470                 (coughs) (laughter)
18781               (laughter) (laughter)
19240    (sharp inhale) (uncertain

In [32]:
exu_df['text'] = exu_df['text'].str.replace(r'(^.*)\(.*\)(.*)\(.*\)(,?.*$)', r'\1 \2 \3', regex=True).astype('str')

In [33]:
lead_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains('^\(.*\) ')])

In [34]:
lead_bracks = lead_bracks['text'].str.replace(r'(^\(.*\)) .*$', r'\1', regex=True).astype('str')

In [35]:
lead = pd.Series(lead_bracks)
lead

35                (laughter)
41                   (gasps)
58                  (unsure)
63                (laughter)
66             (pigeon coos)
                ...         
23137                (gasps)
23176    (uncertain whimper)
23270               (groans)
23272                (sighs)
23302       (happy exhaling)
Name: text, Length: 283, dtype: object

In [36]:
exu_df['text'] = exu_df['text'].str.replace(r'^\(.*\) ', '', regex=True).astype('str')

In [37]:
trail_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains('.* \(.*\)+ $')])

In [38]:
trail_bracks = trail_bracks['text'].str.replace(r'.* (\(.*\)+ $)', r'\1', regex=True).astype('str')
trail = pd.Series(trail_bracks)
trail

61       (laughter) 
232      (laughter) 
265        (laughs) 
339      (laughter) 
418      (laughter) 
            ...     
23014      (laughs) 
23071      (laughs) 
23074    (laughter) 
23245      (laughs) 
23404    (laughter) 
Name: text, Length: 313, dtype: object

In [39]:
exu_df['text'] = exu_df['text'].str.replace(r'\(.*\) $', '', regex=True).astype('str')

In [40]:
mid_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains(r'^.* \(.*\).*$')])

In [41]:
mid_bracks = mid_bracks['text'].str.replace(r'^.* (\(.*\)).*$', r'\1', regex=True).astype('str')
mid = pd.Series(mid_bracks)
mid

67           (laughs)
197         (screams)
750      (whispering)
1154       (shushing)
1180     (whispering)
             ...     
22463       (bellows)
22621      (laughter)
22737      (counting)
23000        (laughs)
23392         (sighs)
Name: text, Length: 154, dtype: object

In [42]:
exu_df['text'] = exu_df['text'].str.replace(r'(^.*) \(.*\)(.*$)', r'\1 \2', regex=True).astype('str')

In [43]:
quote_bracks = pd.DataFrame(exu_df[exu_df['text'].str.contains(r'.* "*\(.*\)"*.*$')])

In [44]:
quote_bracks['text'] = quote_bracks['text'].str.replace(r'.* "*(\(.*\))"*.*$', r'\1', regex=True).astype('str')
quotes = pd.Series(quote_bracks['text'])
quotes

5634         (snickering)
5856       (throat clear)
13094          (chuckles)
15930            (laughs)
16707             (yelps)
17378    (confused noise)
19555        (cold laugh)
Name: text, dtype: object

In [45]:
exu_df['text'] = exu_df['text'].str.replace(r'(.* "*)\(.*\)("*.*$)', r'\1 \2', regex=True).astype('str')

In [46]:
more_quotes = pd.DataFrame(exu_df[exu_df['text'].str.contains(r'"\(.*\)"')])

In [47]:
more_quotes['text'] = more_quotes['text'].str.replace(r'^.*"(\(.*\))"? .*$', r'\1', regex=True).astype('str')
more = pd.Series(more_quotes['text'])
more

4460      (frustrated screech)
4548                   (sighs)
4596    (laughs sarcastically)
7728                  (laughs)
7827         (sarcastic laugh)
Name: text, dtype: object

In [48]:
exu_df['text'] = exu_df['text'].str.replace(r'(^.*")\(.*\)("? .*$)', r'\1 \2', regex=True).astype('str')

In [49]:
odd_ones = pd.DataFrame(exu_df[exu_df['text'].str.contains(r'\(.*\)\.?"')])
odd_ones['text'] = odd_ones['text'].str.replace(r'(\(.*\))\.?".*', r'\1', regex=True).astype('str')
odd_ones

Unnamed: 0,name,text,episode,timestamp,inaudible_speech
4706,AABRIA,(coughing),2,02:03:33,


In [50]:
odds = pd.Series(odd_ones['text'])
odds

4706    (coughing)
Name: text, dtype: object

In [51]:
exu_df['text'] =exu_df['text'].str.replace(r'\(.*\)(\.?".*)', r'\1', regex=True).astype('str')

In [52]:
sounds = [nonspeech, four, three, mult, lead, trail, mid, quotes, more, odds]

In [53]:
noises = pd.concat(sounds)
noises

6                        (laughter) 
14                       (laughter) 
17                       (cheering) 
44      (monkey screech) (laughter) 
70                       (laughter) 
                    ...             
4548                         (sighs)
4596          (laughs sarcastically)
7728                        (laughs)
7827               (sarcastic laugh)
4706                      (coughing)
Name: text, Length: 1573, dtype: object

In [54]:
duplicates = noises.index[noises.index.duplicated(keep=False)]

In [55]:
print(duplicates) #no duplicates!

Int64Index([], dtype='int64')


In [56]:
exu_df['nonspeech'] = noises
exu_df.head()

Unnamed: 0,name,text,episode,timestamp,inaudible_speech,nonspeech
0,AABRIA,Let's start with Robbie.,1,00:00:00,,
1,ROBBIE,I'm so excited!,1,00:00:18,,
2,AABRIA,Yeah!,1,00:00:24,,
3,ROBBIE,Whew!,1,00:00:26,,
4,AABRIA,Liam!,1,00:00:26,,


In [57]:
exu_df['nonspeech'] = exu_df['nonspeech'].fillna('') #get rid of the ugly NaN values
exu_df['inaudible_speech'] = exu_df['inaudible_speech'].fillna('')

In [58]:
exu_df['name'].value_counts() #found a typo in one of the names and fixed it

AABRIA                    7630
AIMEE                     4175
MATT                      2810
ASHLEY                    2659
ROBBIE                    2488
                          ... 
CADUCEUS                     1
JESTER                       1
MARIUS                       1
ROBBIE, AABRIA, ASHLEY       1
ASHLEHY                      1
Name: name, Length: 77, dtype: int64

In [59]:
exu_df['name'] = exu_df['name'].str.replace(r'ASHLEHY', r'ASHLEY', regex=True).astype('str')

In [60]:
exu_df['season'] = 'EXU'

In [61]:
exu_df.head()

Unnamed: 0,name,text,episode,timestamp,inaudible_speech,nonspeech,season
0,AABRIA,Let's start with Robbie.,1,00:00:00,,,EXU
1,ROBBIE,I'm so excited!,1,00:00:18,,,EXU
2,AABRIA,Yeah!,1,00:00:24,,,EXU
3,ROBBIE,Whew!,1,00:00:26,,,EXU
4,AABRIA,Liam!,1,00:00:26,,,EXU


This is considered fully treated. It's going to have to undergo some changes in order to be combined with the D20 data, but it's good as it is for now. I'm going to save it as-is and cross the bridge when I get to it

In [62]:
#exu_df.to_pickle('../pickle_jar/EXU_split.pkl') 

### Cleaning the D20 data

Now, all of the other data I have for Aabria is from D20 and is in the same format, as we know from the last notebook where I built the dataframes for each season. Every df has variable, value (empty), episode, and season. I'm going to get them all in here, do some basic pre-treatment, concatenate them into one large D20 DF, and treat the full thing at once to get it all and split and ready to work on.

In [63]:
ACOFF = pd.read_pickle('../pickle_jar/ACOFF.pkl')
ACOFF = ACOFF.reset_index()
ACOFF

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20 Season 14,,10,ACOFF
1,1,A Court of Fey & Flowers,,10,ACOFF
2,2,Starring: Aabria Iyengar as Game Master,,10,ACOFF
3,3,Surena Marie as BINX Choppley; Oscar Montoya a...,,10,ACOFF
4,4,Lou Wilson as Lord Squak Airavis; Emily Axford...,,10,ACOFF
...,...,...,...,...,...
23021,1986,Lou: Of course.,,9T,ACOFF
23022,1987,[Lou's lips smacking passionately],,9T,ACOFF
23023,1988,----------------------------------------------...,,9T,ACOFF
23024,1989,Captions extracted by: Kyber Bonsai,,9T,ACOFF


In [64]:
ACOFF.variable[0:11]

0                                Dimension 20 Season 14
1                              A Court of Fey & Flowers
2               Starring: Aabria Iyengar as Game Master
3     Surena Marie as BINX Choppley; Oscar Montoya a...
4     Lou Wilson as Lord Squak Airavis; Emily Axford...
5                                          Featherfowl;
6     Brennan Lee Mulligan as Captain K.P. Hob; and ...
7     Episode 10: You Will Never Know a Lonely Day A...
8                                  < [Previous Episode]
9                             [elegant classical music]
10    Aabria: To the churlish Captain K.P. Hob, vene...
Name: variable, dtype: object

In [65]:
meta = [0,1,2,3,4,5,6,7,8,9]

In [66]:
ACOFF=ACOFF.drop(meta)   #get rid of those starting credits - those are from the site header data
ACOFF.head()

Unnamed: 0,index,variable,value,episode,season
10,10,"Aabria: To the churlish Captain K.P. Hob, vene...",,10,ACOFF
11,11,"Goblin Court. To the master of ceremonies, Del...",,10,ACOFF
12,12,"the Court of Wonder. To BINX Choppley, sole su...",,10,ACOFF
13,13,"Craft. To the tenebrous Prince Andhera, scion ...",,10,ACOFF
14,14,"to the notorious Lords of the Wing, Lady Chirp...",,10,ACOFF


In [67]:
ACOFF[ACOFF['episode']=='10'][-5:]

Unnamed: 0,index,variable,value,episode,season
1313,1313,All: Hi-ya!,,10,ACOFF
1314,1314,Aabria: Bye.,,10,ACOFF
1315,1315,----------------------------------------------...,,10,ACOFF
1316,1316,Captions extracted by: Kyber Bonsai,,10,ACOFF
1317,1317,"Edited by: jooloo, OliverC, Iris (@sacredwhim)",,10,ACOFF


Now per the D20 data license agreement I have to list credit to the editors, so I"m saving this information for my licensing .md file but erasing it from the df since (again) it's not speech.

Captions extracted by: Kyber Bonsai<br>
Edited by: jooloo, OliverC, Iris (@sacredwhim)

rinse and repeat for the other 9 episodes. I'd find a better way to do this if there were more episodes, but since there are really so few I'll just scan one by one and mark down all of the indexes to get rid of in one go

In [68]:
credit = [1315,1316,1317]
ACOFF=ACOFF.drop(credit) 

first we have to fix those episode numbers

In [69]:
ACOFF[ACOFF['episode'].str.contains(r'\d\w')]

Unnamed: 0,index,variable,value,episode,season
10,10,"Aabria: To the churlish Captain K.P. Hob, vene...",,10,ACOFF
11,11,"Goblin Court. To the master of ceremonies, Del...",,10,ACOFF
12,12,"the Court of Wonder. To BINX Choppley, sole su...",,10,ACOFF
13,13,"Craft. To the tenebrous Prince Andhera, scion ...",,10,ACOFF
14,14,"to the notorious Lords of the Wing, Lady Chirp...",,10,ACOFF
...,...,...,...,...,...
23021,1986,Lou: Of course.,,9T,ACOFF
23022,1987,[Lou's lips smacking passionately],,9T,ACOFF
23023,1988,----------------------------------------------...,,9T,ACOFF
23024,1989,Captions extracted by: Kyber Bonsai,,9T,ACOFF


In [70]:
ACOFF['episode'] = ACOFF['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
ACOFF['episode'].value_counts()

3     2913
7     2855
2     2842
8     2542
5     2430
6     2368
9     1991
1     1896
4     1871
10    1305
Name: episode, dtype: int64

For the sake of efficiency I'm indexing to the start and end of each episde in the cell below rather than make 200 cells going 1 by 1. Compile all the start meta index values, all the end credit ones, and save all of the 

In [71]:
ACOFF[ACOFF['episode']=='9'][-5:]

Unnamed: 0,index,variable,value,episode,season
23021,1986,Lou: Of course.,,9,ACOFF
23022,1987,[Lou's lips smacking passionately],,9,ACOFF
23023,1988,----------------------------------------------...,,9,ACOFF
23024,1989,Captions extracted by: Kyber Bonsai,,9,ACOFF
23025,1990,"Edited by: OliverC, Iris (@sacredwhim)",,9,ACOFF


In [72]:
meta = [1318,1319,1320,1321,1322,1323,1324,1325,1326,1327,1328,3214,3215,3216,3217,3218,3219,3220,3221,3222,3223,3224,6056,6057,6058,6059,6060,6061,6062,6063,6064,6065,6066,8969,8970,8971,8972,8973,8974,8975,8976,8977,8978,8979,10840,10841,10842,10843,10844,10845,10846,10847,10848,10849,10850,13270,13271,13272,13273,13274,13275,13276,13277,13278,13279,15638,15639,15640,15641,15642,15643,15644,15645,15646,15647,18493,18494,18495,18496,18497,18498,18499,18500,18501,18502,21035,21036,21037,21038,21039,21040,21041,21041,21042,21043,21044]

In [73]:
credit = [1988,1989,1990,3211,3212,3213,6053,6054,6055,8966,8967,8968,10837,10838,10839,13267,13268,13269,15635,15636,15637,18490,18491,18492,21032,21033,21034,23023,23024,23025]

In [74]:
ACOFF=ACOFF.drop(meta) 

In [75]:
ACOFF=ACOFF.drop(credit) 

Captions extracted by: gluegunshots
Tish (Tish#3276), Iris (@sacredwhim)

Captions extracted by: Kyber Bonsai
Edited by: OliverC

krsp

In [76]:
ACOFF

Unnamed: 0,index,variable,value,episode,season
10,10,"Aabria: To the churlish Captain K.P. Hob, vene...",,10,ACOFF
11,11,"Goblin Court. To the master of ceremonies, Del...",,10,ACOFF
12,12,"the Court of Wonder. To BINX Choppley, sole su...",,10,ACOFF
13,13,"Craft. To the tenebrous Prince Andhera, scion ...",,10,ACOFF
14,14,"to the notorious Lords of the Wing, Lady Chirp...",,10,ACOFF
...,...,...,...,...,...
23018,1983,us. Are you ready?,,9,ACOFF
23019,1984,"Squak: Yeah, yeah.",,9,ACOFF
23020,1985,Emily: Can I hear the sound effects that we hear?,,9,ACOFF
23021,1986,Lou: Of course.,,9,ACOFF


We're in a good place! Now just do exactly that again with all of the other seasons of the show

In [77]:
ACOFF_AP = pd.read_pickle('../pickle_jar/ACOFF_AP.pkl')
ACOFF_AP = ACOFF_AP.reset_index()
ACOFF_AP

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20,,10,ACOFF AP
1,1,Adventuring Party,,10,ACOFF AP
2,2,,,10,ACOFF AP
3,3,Schrödinger’s Canon,,10,ACOFF AP
4,4,Season 10 Episode 10,,10,ACOFF AP
...,...,...,...,...,...
6941,519,Surena: Bye!,,9D,ACOFF AP
6942,520,Brennan: Bye!,,9D,ACOFF AP
6943,521,----------------------------------------------...,,9D,ACOFF AP
6944,522,Captions extracted by: OliverC,,9D,ACOFF AP


In [78]:
ACOFF_AP['episode'] = ACOFF_AP['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
ACOFF_AP['episode'].value_counts()

11    836
10    814
1     807
3     678
7     627
8     605
2     591
6     531
9     524
4     482
5     451
Name: episode, dtype: int64

In [79]:
ACOFF_AP[ACOFF_AP['episode']=='9'][-5:]

Unnamed: 0,index,variable,value,episode,season
6941,519,Surena: Bye!,,9,ACOFF AP
6942,520,Brennan: Bye!,,9,ACOFF AP
6943,521,----------------------------------------------...,,9,ACOFF AP
6944,522,Captions extracted by: OliverC,,9,ACOFF AP
6945,523,Edited by: OliverC,,9,ACOFF AP


In [80]:
meta2 = [0,1,2,3,4,5,814,815,816,817,818,819,1650,1651,1652,1653,1654,1655,2457,2458,2459,2460,2461,2462,3048,3049,3050,3051,3052,3052,3053,3726,3727,3728,3729,3730,3731,3726,3727,3728,3729,3730,3731,4208,4209,4210,4211,4212,4213,4659,4660,4661,4662,4663,4664,5190,5191,5192,5193,5194,5195,5817,5818,5819,5820,5821,5822,6422,6423,6424,6425,6426,6427]

In [81]:
credit2=[811,812,813,2454,2455,2456,3045,3046,3047,3723,3724,3725,4205,4206,4207,4656,4657,4658,5188,5189,5814,5815,5816,6419,6420,6421,6943,6944,6945]

In [82]:
ACOFF_AP=ACOFF_AP.drop(meta2) 

In [83]:
ACOFF_AP=ACOFF_AP.drop(credit2) 

In [84]:
ACOFF_AP

Unnamed: 0,index,variable,value,episode,season
6,6,"Aabria: Hello, my pack of pixies, and welcome ...",,10,ACOFF AP
7,7,Party for A Court of Fey & Flowers. We did it....,,10,ACOFF AP
8,8,Surena: We're ghosts. All of us are ghosts.,,10,ACOFF AP
9,9,Oscar: We're all ghosts.,,10,ACOFF AP
10,10,"Aabria: Everyone's dead, and everyone's a ghos...",,10,ACOFF AP
...,...,...,...,...,...
6938,516,for the finale.,,9,ACOFF AP
6939,517,(group shouts),,9,ACOFF AP
6940,518,Aabria: Bye!,,9,ACOFF AP
6941,519,Surena: Bye!,,9,ACOFF AP


Captions extracted by: OliverC<br>
Edited by: OliverC

In [85]:
burrow = pd.read_pickle('../pickle_jar/burrow.pkl')
burrow=burrow.reset_index()
burrow

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20 Season 20,,10,Burrow's End
1,1,Burrow’s End,,10,Burrow's End
2,2,StarrinAva: Aabria Iyengar as Game Master,,10,Burrow's End
3,3,Brennan Lee Mulligan as Tula; Erika Ishii as Ava;,,10,Burrow's End
4,4,Siobhan Thompson as Jaysohn; Rashawn Nadine Sc...,,10,Burrow's End
...,...,...,...,...,...
24356,2406,"Ava: Well, help us understand, then.",,9H,Burrow's End
24357,2407,Stoat: Understand this!,,9H,Burrow's End
24358,2408,Viola: Babe!,,9H,Burrow's End
24359,2409,"Thorn: It's been a pleasure, everyone. Been a ...",,9H,Burrow's End


In [86]:
burrow['episode'] = burrow['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
burrow['episode'].value_counts()

6     3401
5     3028
2     2614
10    2425
9     2411
3     2326
4     2266
1     2117
8     1902
7     1871
Name: episode, dtype: int64

In [87]:
burrow[burrow['episode']=='9'][-5:]

Unnamed: 0,index,variable,value,episode,season
24356,2406,"Ava: Well, help us understand, then.",,9,Burrow's End
24357,2407,Stoat: Understand this!,,9,Burrow's End
24358,2408,Viola: Babe!,,9,Burrow's End
24359,2409,"Thorn: It's been a pleasure, everyone. Been a ...",,9,Burrow's End
24360,2410,(somber music continues),,9,Burrow's End


In [88]:
meta3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,2425,2426,2427,2428,2429,2430,2431,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441,2442,4542,4543,4544,4545,4546,4547,4548,4549,4550,4551,7156,7157,7158,7159,7160,7161,7162,7163,7164,7165,7166,7167,7168,7169,9482,9483,9484,9485,9486,9487,9488,9489,9490,11748,11749,11750,11751,11752,11753,11754,11755,11756,11757,11758,11759,11760,11761,11762,14776,14777,14778,14779,14780,14781,14782,14783,14784,14785,14786,14787,14788,14789,14790,18177,18178,18179,18180,18181,18182,18183,18184,18185,18186,18187,18188,18189,18190,18191,20048,20049,20050,20051,20052,20053,20054,20055,20056,20057,20058,20059,20060,20061,20062,21950,21951,21952,21953,21954,21955,21956,21957,21958,21959,21960,21961,21962,21963]

In [89]:
credit3 = [4539,4540,4541,7153,7154,7155,9479,9480,9481,11745,11746,11747,14773,14774,14775,18174,18175,18176,20047,21949,24360]

In [90]:
burrow = burrow.drop(meta3)

In [91]:
burrow=burrow.drop(credit3)

In [92]:
burrow

Unnamed: 0,index,variable,value,episode,season
32,32,Aabria: Y'all ready to play some fucking D&D? ...,,10,Burrow's End
33,33,Hello friends and welcome to the finale of Bur...,,10,Burrow's End
34,34,"joint, it got a little weird already.",,10,Burrow's End
35,35,Rashawn: Yes.,,10,Burrow's End
36,36,Aabria: I am your warren warden and Dungeon Ma...,,10,Burrow's End
...,...,...,...,...,...
24355,2405,Stoat: I don't mind being misunderstood.,,9,Burrow's End
24356,2406,"Ava: Well, help us understand, then.",,9,Burrow's End
24357,2407,Stoat: Understand this!,,9,Burrow's End
24358,2408,Viola: Babe!,,9,Burrow's End


Captions extracted by: Rachel S.
Edited by: Claire B., OliverC, gluegunshots 	

Mer

In [93]:
burrowAP = pd.read_pickle('../pickle_jar/burrowAP.pkl')
burrowAP=burrowAP.reset_index()
burrowAP

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20,,1S,Burrow's End AP
1,1,Adventuring Party,,1S,Burrow's End AP
2,2,,,1S,Burrow's End AP
3,3,Stoatal Recall,,1S,Burrow's End AP
4,4,Season 15 Episode 1,,1S,Burrow's End AP
...,...,...,...,...,...
3155,743,Siobhan: Bye.,,5B,Burrow's End AP
3156,744,"Erika: Yum, yum, yum.",,5B,Burrow's End AP
3157,745,----------------------------------------------...,,5B,Burrow's End AP
3158,746,Captions extracted by: OliverC,,5B,Burrow's End AP


In [94]:
burrowAP['episode'] = burrowAP['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
burrowAP['episode'].value_counts()

5    748
3    711
4    602
1    570
2    529
Name: episode, dtype: int64

In [95]:
burrowAP[burrowAP['episode']=='5'][-5:]

Unnamed: 0,index,variable,value,episode,season
3155,743,Siobhan: Bye.,,5,Burrow's End AP
3156,744,"Erika: Yum, yum, yum.",,5,Burrow's End AP
3157,745,----------------------------------------------...,,5,Burrow's End AP
3158,746,Captions extracted by: OliverC,,5,Burrow's End AP
3159,747,"Edited by: OliverC, Mer",,5,Burrow's End AP


In [96]:
meta4 = [0,1,2,3,4,5,570,571,572,573,574,575,1099,1100,1101,1102,1103,1104,1810,1811,1812,1813,1814,1815,2412,2413,2414,2415,2416,2417]

In [97]:
credit4 = [567,568,569,1096,1097,1098,1807,1808,1809,2409,2410,2411,3157,3158,3159]

In [98]:
burrowAP = burrowAP.drop(meta4)

In [99]:
burrowAP = burrowAP.drop(credit4)

In [100]:
burrowAP

Unnamed: 0,index,variable,value,episode,season
6,6,Aabria: Hello and welcome to the first episode...,,1,Burrow's End AP
7,7,reaching for?,,1,Burrow's End AP
8,8,Izzy: There are little things.,,1,Burrow's End AP
9,9,Aabria: Don't worry about that yet. Don't worr...,,1,Burrow's End AP
10,10,"Siobhan: What, there's clues, there's clues! T...",,1,Burrow's End AP
...,...,...,...,...,...
3152,740,thumping],,5,Burrow's End AP
3153,741,Brennan: Ah!,,5,Burrow's End AP
3154,742,Jasper: That was so good.,,5,Burrow's End AP
3155,743,Siobhan: Bye.,,5,Burrow's End AP


In [101]:
mismag = pd.read_pickle('../pickle_jar/mismag.pkl')
mismag=mismag.reset_index()
mismag

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20,,1T,Misfits and Magic
1,1,Misfits and Magic,,1T,Misfits and Magic
2,2,Starring: Aabria Iyengar as Game Master,,1T,Misfits and Magic
3,3,Erika Ishii as Dream;,,1T,Misfits and Magic
4,4,Danielle Radford as Sam Black;,,1T,Misfits and Magic
...,...,...,...,...,...
14737,3345,our wonderful holiday special. Thank you very ...,,Ma,Misfits and Magic
14738,3346,Lulling everybody.,,Ma,Misfits and Magic
14739,3347,All: Happy Lulling.,,Ma,Misfits and Magic
14740,3348,[a]no idea what's being said here around 1:10:20,,Ma,Misfits and Magic


This series has 4 episodes and one holiday special episode, in here encoded as "Ma". I'm going to recode that as episode 5 for the sake of processing

In [102]:
mismag['episode'] = mismag['episode'].str.replace(r'Ma', r'5', regex=True).astype('str')

In [103]:
mismag['episode'] = mismag['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
mismag['episode'].value_counts() #this is a full season, it's just very short'

5    3350
4    3261
1    2899
2    2667
3    2565
Name: episode, dtype: int64

In [104]:
mismag[mismag['episode']=='5'][-5:]

Unnamed: 0,index,variable,value,episode,season
14737,3345,our wonderful holiday special. Thank you very ...,,5,Misfits and Magic
14738,3346,Lulling everybody.,,5,Misfits and Magic
14739,3347,All: Happy Lulling.,,5,Misfits and Magic
14740,3348,[a]no idea what's being said here around 1:10:20,,5,Misfits and Magic
14741,3349,"[b]wasn't sure what is being said here, around...",,5,Misfits and Magic


In [105]:
meta5 = [0,1,2,3,4,5,6,7,8,2899,2900,2901,2902,2903,2904,2905,2906,2907,2908,2909,2910,2911,2912,5566,5567,5568,5569,5570,5571,5572,5573,5574,5575,5576,5577,5578,5579,5580,8131,8132,8133,8134,8135,8136,8137,8138,8139,8140,8141,8142,11392,11393,11394,11395,11396,11397,11398,11399,11400]

In [106]:
credit5 = [2896,2897,2898,5563,5564,5565,8129,8130,11390,11391,14740,14741]

In [107]:
mismag = mismag.drop(meta5)

In [108]:
mismag=mismag.drop(credit5)

In [109]:
mismag

Unnamed: 0,index,variable,value,episode,season
9,9,"Aabria: Hello, friends, and welcome. We're her...",,1,Misfits and Magic
10,10,"a new game called ""Misfits and Magic""! [cheeri...",,1,Misfits and Magic
11,11,Cast cheers and laughs.,,1,Misfits and Magic
12,12,"Aabria: I am your game master, Aabria Iyengar,...",,1,Misfits and Magic
13,13,"Magical Misfits. Say hi, Magical Misfits!",,1,Misfits and Magic
...,...,...,...,...,...
14735,3343,"you threw for yourselves, that you threw for y...",,5,Misfits and Magic
14736,3344,full swing.That is where are we going to leave...,,5,Misfits and Magic
14737,3345,our wonderful holiday special. Thank you very ...,,5,Misfits and Magic
14738,3346,Lulling everybody.,,5,Misfits and Magic


Edited by: Kyber Bonsai, tevildo, gluegunshots, solsys
solsys, Tillie the Paladin, gluegunshots, kindlestuck
kindlestuck, solsys, Ethan Belanger

In [110]:
mismagAP = pd.read_pickle('../pickle_jar/mismagAP.pkl')
mismagAP=mismagAP.reset_index()
mismagAP

Unnamed: 0,index,variable,value,episode,season
0,0,Dimension 20,,1C,Misfits and Magic AP
1,1,Adventuring Party,,1C,Misfits and Magic AP
2,2,Cluck if You Buck,,1C,Misfits and Magic AP
3,3,Season 5 Episode 1,,1C,Misfits and Magic AP
4,4,[Next Episode] >,,1C,Misfits and Magic AP
...,...,...,...,...,...
4347,1247,"it right one time, thank God.",,5E,Misfits and Magic AP
4348,1248,"With all of you. So that's it, cut the feed!",,5E,Misfits and Magic AP
4349,1249,----------------------------------------------...,,5E,Misfits and Magic AP
4350,1250,Captions extracted by: OliverC,,5E,Misfits and Magic AP


In [111]:
mismagAP['episode'] = mismagAP['episode'].str.replace(r'^(\d+)\w*$', r'\1', regex=True).astype('str')
mismagAP['episode'].value_counts()

5    1252
3     994
2     759
1     713
4     634
Name: episode, dtype: int64

In [112]:
mismagAP[mismagAP['episode']=='5'][-5:]

Unnamed: 0,index,variable,value,episode,season
4347,1247,"it right one time, thank God.",,5,Misfits and Magic AP
4348,1248,"With all of you. So that's it, cut the feed!",,5,Misfits and Magic AP
4349,1249,----------------------------------------------...,,5,Misfits and Magic AP
4350,1250,Captions extracted by: OliverC,,5,Misfits and Magic AP
4351,1251,Edited by: OliverC,,5,Misfits and Magic AP


In [113]:
meta6 = [0,1,2,3,4,713,714,715,716,717,1472,1473,1474,1475,1476,2466,2467,2468,2469,2470,3100,3101,3102,3103,3104]

In [114]:
credit6 = [710,711,712,1469,1470,1471,2463,2464,2465,3097,3098,3099,4349,4350,4351]

In [115]:
mismagAP=mismagAP.drop(meta6)

In [116]:
mismagAP=mismagAP.drop(credit6)

In [117]:
mismagAP

Unnamed: 0,index,variable,value,episode,season
5,5,"Aabria: Hello, and welcome to the first episod...",,1,Misfits and Magic AP
6,6,"for ""Misfits and Magic"". We're gonna chat a li...",,1,Misfits and Magic AP
7,7,"one. Just, hey y'all, how's everyone doing? Le...",,1,Misfits and Magic AP
8,8,doing? We okay?,,1,Misfits and Magic AP
9,9,Brennan: Whoo!,,1,Misfits and Magic AP
...,...,...,...,...,...
4344,1244,your hard work. This is literally what we mean...,,5,Misfits and Magic AP
4345,1245,"collaborative storytelling, We all built this ...",,5,Misfits and Magic AP
4346,1246,"an honor and a privilege to get to build ""Misf...",,5,Misfits and Magic AP
4347,1247,"it right one time, thank God.",,5,Misfits and Magic AP


#### Misfits & Magic S2 

is not completed but I will use it later if I can

In [118]:
mismag2 = pd.read_pickle('../pickle_jar/mismag2.pkl')
mismag2=mismag2.reset_index()
mismag

Unnamed: 0,index,variable,value,episode,season
9,9,"Aabria: Hello, friends, and welcome. We're her...",,1,Misfits and Magic
10,10,"a new game called ""Misfits and Magic""! [cheeri...",,1,Misfits and Magic
11,11,Cast cheers and laughs.,,1,Misfits and Magic
12,12,"Aabria: I am your game master, Aabria Iyengar,...",,1,Misfits and Magic
13,13,"Magical Misfits. Say hi, Magical Misfits!",,1,Misfits and Magic
...,...,...,...,...,...
14735,3343,"you threw for yourselves, that you threw for y...",,5,Misfits and Magic
14736,3344,full swing.That is where are we going to leave...,,5,Misfits and Magic
14737,3345,our wonderful holiday special. Thank you very ...,,5,Misfits and Magic
14738,3346,Lulling everybody.,,5,Misfits and Magic


### Concatenating and Splitting

back into the regex mines I go!

In [119]:
d20 = [ACOFF, ACOFF_AP, burrow, burrowAP, mismag, mismagAP]

In [120]:
d20_df = pd.concat(d20)

In [121]:
d20_df = d20_df.reset_index()

In [122]:
d20_df = d20_df.drop(['level_0', 'value'], axis=1)
d20_df

Unnamed: 0,index,variable,episode,season
0,10,"Aabria: To the churlish Captain K.P. Hob, vene...",10,ACOFF
1,11,"Goblin Court. To the master of ceremonies, Del...",10,ACOFF
2,12,"the Court of Wonder. To BINX Choppley, sole su...",10,ACOFF
3,13,"Craft. To the tenebrous Prince Andhera, scion ...",10,ACOFF
4,14,"to the notorious Lords of the Wing, Lady Chirp...",10,ACOFF
...,...,...,...,...
76015,1244,your hard work. This is literally what we mean...,5,Misfits and Magic AP
76016,1245,"collaborative storytelling, We all built this ...",5,Misfits and Magic AP
76017,1246,"an honor and a privilege to get to build ""Misf...",5,Misfits and Magic AP
76018,1247,"it right one time, thank God.",5,Misfits and Magic AP


In [123]:
print(d20_df.variable[0])
print(d20_df.variable[1])
print(d20_df.variable[2])
print(d20_df.variable[3])
print(d20_df.variable[4])
print(d20_df.variable[5])
print(d20_df.variable[6])
print(d20_df.variable[7])
print(d20_df.variable[8])
print(d20_df.variable[9])
print(d20_df.variable[10])

Aabria: To the churlish Captain K.P. Hob, venerated soldier of the
Goblin Court. To the master of ceremonies, Delloso de la Rue, pride of
the Court of Wonder. To BINX Choppley, sole survivor of the Court of
Craft. To the tenebrous Prince Andhera, scion of the Unseelie Court. And
to the notorious Lords of the Wing, Lady Chirp Featherfowl, Countess of
Cluckingham, and Lord Squak Airavis, Earl of Peckersburg. Honored
archfey, we are delighted to welcome you to A Court of Fey & Flowers.
Hello, and welcome to Dimension 20's A Court of Fey & Flowers, the
thrilling finale. I am your Dungeon Master, Aabria Iyengar, and with me,
one last time, are my pack of pixies. Say hi, pack of pixies!
All: Hi, pack of pixies!


A quirk of the website format is that when someone has a long speech/several lines of dialogue it's tagged with their name at the very first line but none of the trailing ones. So you know when the next line starts when the next tagged name begins. It looks like I may be able to fix this though with the .ffill feature from pandas... let's separate out the names we do have, make them a series, and add them to a new column and see where that gets us.

This won't be perfect... because there are lines of text that are not speech and will be moved to the nonspeech columns, but some of THOSE unlike in cr_df are descriptive lines about the room itself. (see line 5225 below). 

In [124]:
d20_df.variable[5225]

'Dome changes from green to blue.'

I found player names in brackets like this: Izzy: \[Izzy\], a holdover from the more recently completed transcriptions (You can tell because they're all only found in Burrow's End and Burrows End AP) that must have been marked as temporary before being confirmed. I'm going to get rid of those as I find them.

In [125]:
d20_df[d20_df['variable'].str.contains('\[Izzy\]')]

Unnamed: 0,index,variable,episode,season
29754,47,Izzy: [Izzy] Bookends are not circles. (Siobha...,10,Burrow's End
29784,77,Izzy: [Izzy] Did Teedles-?,10,Burrow's End
29790,83,Izzy: [Izzy] Call.,10,Burrow's End
29969,262,"Lila: [Izzy] ""Meloquence""? (Players intentiona...",10,Burrow's End
30064,357,Izzy: [Izzy] Okay.,10,Burrow's End
...,...,...,...,...
54570,93,Izzy: [Izzy] Not for nothing.,2,Burrow's End AP
54666,189,Izzy: [Izzy] Dowels?,2,Burrow's End AP
54807,330,[Siobhan] reveled in violence. You [Izzy] lear...,2,Burrow's End AP
54881,404,Izzy: [Izzy] She's long dead.,2,Burrow's End AP


In [126]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Izzy\]', r'', regex=True).astype('str')

In [127]:
d20_df.variable[29754]

'Izzy:  Bookends are not circles. (Siobhan laughs)'

In [128]:
#d20_df[d20_df['variable'].str.contains('\[Aabria\]')]

In [129]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Aabria\]', r'', regex=True).astype('str')

In [130]:
#d20_df[d20_df['variable'].str.contains('\[Rashawn\]')]

In [131]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Rashawn\]', r'', regex=True).astype('str')

In [132]:
#d20_df[d20_df['variable'].str.contains('\[Erika\]')]

In [133]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Erika\]', r'', regex=True).astype('str')

In [134]:
#d20_df[d20_df['variable'].str.contains('\[Jasper\]')]

In [135]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Jasper\]', r'', regex=True).astype('str')

In [136]:
#d20_df[d20_df['variable'].str.contains('\[Siobhan\]')]

In [137]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Siobhan\]', r'', regex=True).astype('str')

In [138]:
#d20_df[d20_df['variable'].str.contains('\[Brennan\]')]

In [139]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Brennan\]', r'', regex=True).astype('str')

In [140]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[Cast\]', r'', regex=True).astype('str')

In [141]:
#d20_df['variable'] = d20_df['variable'].str.replace(r'AlLila', r'All', regex=True).astype('str')

In [142]:
d20_df['variable'] = d20_df['variable'].str.replace(r'\[All\]', r'', regex=True).astype('str')

In [143]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Airry]', r'Airry', regex=True).astype('str')

In [144]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\^\(\[a\]\)', r'\1', regex=True).astype('str')

In [145]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\^\(\[b\]\)', r'\1', regex=True).astype('str')

In [146]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Tula, internally: When I was walking back, \(somber music\) it was the day', r'Tula: (internally) When I was walking back, (somber music) it was the day', regex=True).astype('str')

In [147]:
odd_r = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*\(.*\).*\(.*$')])

In [148]:
odd_r['variable'] = odd_r['variable'].str.replace(r'.*(\(.*\)).*(\(.*\)).*(\(.*\)).*(\(.*)$', r'\1\2\3\4', regex=True).astype('str')
odd_r = pd.Series(odd_r['variable'])
odd_r

34932    (Erika laughs)(slow electronic music)(bear roa...
74154    (all cheering)(Lou singing)(token clanking)(al...
Name: variable, dtype: object

In [149]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\(.*\)(.*)\(.*\)(.*)\(.*$', r'\1\2\3\4', regex=True).astype('str')

In [150]:
d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*\(.*$')]

Unnamed: 0,index,variable,episode,season
30038,331,Rashawn: (Spanish accent) Radio Soul (group la...,10,Burrow's End
30233,526,Dr. Tara: (impressed) Fuck. (Brennan chortles)...,10,Burrow's End
34540,322,(group laughs) (Jasper sighs) (group laughs),2,Burrow's End
34812,594,(slow music) (Izzy claps) (Chimes for a natura...,2,Burrow's End
34902,684,Brennan: (breathy) Viola. (Brennan groans brea...,2,Burrow's End
34906,688,dart. You're like- (slap) (groans) (group laughs),2,Burrow's End
35343,1125,(slow music) (heart beats) (group laughs as Iz...,2,Burrow's End
53478,1966,"Viola: No, that's MeatWolf! (Erika screams) (A...",9,Burrow's End


In [151]:
d20_df[d20_df['variable'].str.contains('.*\[.*\].*\(.*\).*\(.*\).*')]

Unnamed: 0,index,variable,episode,season
62836,277,Evan: [demonic voice] One (growls) two! (screams),3,Misfits and Magic


In [152]:
small_mix = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\(.*\).*\(.*\).*')])

In [153]:
small_mix['variable'] = small_mix['variable'].str.replace(r'.*(\[.*\]).*(\(.*\)).*(\(.*\)).*', r'\1\2\3', regex=True).astype('str')

In [154]:
small = pd.Series(small_mix['variable'])
small

62836    [demonic voice](growls)(screams)
Name: variable, dtype: object

In [155]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)\(.*\)(.*)\(.*\)(.*)', r'\1\2\3\4', regex=True).astype('str')

In [156]:
#d20_df[d20_df['variable'].str.contains('.*\[.*\].*\(.*\).*')]

In [157]:
mix_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\(.*\).*')])

In [158]:
mix_bracks['variable'] = mix_bracks['variable'].str.replace(r'.*(\[.*\]).*(\(.*\)).*', r'\1\2', regex=True).astype('str')
mix_b = pd.Series(mix_bracks['variable'])
mix_b

55192    [mimics holding a sniper](cast laughing)
55390                    [singing](cast laughing)
59984             [nervously, stuttering](laughs)
64511                         [high voice](sighs)
66800           [claps](Cinderella-esque singing)
66945                         [whispered](laughs)
Name: variable, dtype: object

In [159]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)\(.*\)(.*)', r'\1\2\3', regex=True).astype('str')

In [160]:
d20_df[d20_df['variable'].str.contains('.*\(.*\).*\[.*\].*')]

Unnamed: 0,index,variable,episode,season
67899,2789,Lou: (laughs) God. [shivers and hugs himself],4,Misfits and Magic


In [161]:
rsq_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\[.*\].*')])

In [162]:
rsq_bracks['variable'] = rsq_bracks['variable'].str.replace(r'.*(\(.*\)).*(\[.*\]).*', r'\1\2', regex=True).astype('str')
rsq = pd.Series(rsq_bracks['variable'])
rsq

67899    (laughs)[shivers and hugs himself]
Name: variable, dtype: object

In [163]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\[.*\].*', r'\1\2', regex=True).astype('str')

In [164]:
#d20_df[d20_df['variable'].str.contains('^\[.*\]$')]

In [165]:
sq_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^\[.*\]$')])

In [166]:
sq_bracks['variable'] = sq_bracks['variable'].str.replace(r'^(\[.*\])$', r'\1', regex=True).astype('str')
sq = pd.Series(sq_bracks['variable'])
sq

15             [flashback from episode 9]
31                    [return to present]
36                          [Omar laughs]
58                        [players laugh]
70                      [Aabria laughing]
                       ...               
68960         [Sam makes shushing noises]
69036                             [pause]
69168                         [all laugh]
69242              [glass smashing sound]
70606    [echoing, knocking sound effect]
Name: variable, Length: 839, dtype: object

In [167]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^(\[.*\])$', r'', regex=True).astype('str')

In [168]:
d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*\[.*\].*')]

Unnamed: 0,index,variable,episode,season
1845,551,Brennan: I go... [sound of wings flapping] [Br...,1,ACOFF


In [169]:
three_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*\[.*\].*')])

In [170]:
three_bracks['variable'] = three_bracks['variable'].str.replace(r'.*(\[.*\]).*(\[.*\]).*(\[.*\]).*', r'\1\2\3', regex=True).astype('str')
three = pd.Series(three_bracks['variable'])
three

1845    [sound of wings flapping][Brennan gulps][coughs]
Name: variable, dtype: object

In [171]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\].*\[.*\].*\[.*\](.*)', r'\1\2', regex=True).astype('str')

In [172]:
d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*\[.*')]

Unnamed: 0,index,variable,episode,season
19195,824,Major Hob: [sighs softly] [inhales and exhales...,8,ACOFF
44881,494,Aabria: Insight check. [Jasper breathes deeply...,6,Burrow's End
47679,3292,cover this big hole. [shout] I'm a mother! [st...,6,Burrow's End


In [173]:
three_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*\[.*')])

In [174]:
three_cut['variable'] = three_cut['variable'].str.replace(r'.*(\[.*\]).*(\[.*\]).*(\[.*)', r'\1\2\3', regex=True).astype('str')
three_cut = pd.Series(three_cut['variable'])
three_cut

19195    [sighs softly][inhales and exhales roughly][sighs
44881       [Jasper breathes deeply][tense music][die taps
47679                         [shout][stone scraping][cast
Name: variable, dtype: object

In [175]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)\[.*\](.*)\[.*', r'\1\2\3', regex=True).astype('str')

In [176]:
#d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*')]

In [177]:
two_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*\].*')])

In [178]:
two_bracks['variable'] = two_bracks['variable'].str.replace(r'.*(\[.*\]).*(\[.*\]).*', r'\1\2', regex=True).astype('str')
two = pd.Series(two_bracks['variable'])
two

2584                                    [laughs][laughs]
2780                         [stammering][clears throat]
3578     [eats the letter without looking at it][pauses]
3917                            [clears throat][frantic]
4042                               [laughs][slams table]
                              ...                       
58485                             [snorts][normal voice]
59654                                  [to Evan][louder]
59692                                  [rolls][laughing]
62857                                [whispered][normal]
64768                               [stuttering][to Sam]
Name: variable, Length: 109, dtype: object

In [179]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)\[.*\](.*)', r'\1\2\3', regex=True).astype('str')

In [180]:
#d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*')]

In [181]:
two_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*\[.*')])

In [182]:
two_cut['variable'] = two_cut['variable'].str.replace(r'.*(\[.*\]).*(\[.*)', r'\1\2', regex=True).astype('str')
two_cut = pd.Series(two_cut['variable'])
two_cut

1927                                  [muttering][clears
2425     [clears throat, then pauses][clears throat more
37647                             [heart beating][static
37921                            [group exhale][crickets
38010       [yelling][somber music swells, stutters, and
42618                 [Bryan Cranston impression][Jasper
45254                   [blows sharply][players groaning
45547                   [haunting sinister music][Jasper
46589                  [screaming, holding the map][cast
46728                                   [to Siobhan][die
47036                     [tearfully][Brennan and Aabria
47148                        [even louder][Siobhan sinks
48024                                  [to Erika][to the
48386                               [to Siobhan][Siobhan
51284              [Brennan holds up five fingers][glass
51338                               [hoarsely][he clears
54106                    [stammers][cast laughing at the
56591                        [g

In [183]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)\[.*', r'\1\2', regex=True).astype('str')

In [184]:
#d20_df[d20_df['variable'].str.contains('.*\].*\[.*')]

In [185]:
odd_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\].*\[.*')])

In [186]:
odd_cut['variable'] = odd_cut['variable'].str.replace(r'(.*\]).*(\[.*)', r'\1\2', regex=True).astype('str')
odd = pd.Series(odd_cut['variable'])
odd

6659                             splashes][Emily laughing]
37350                             makes a face][orchestral
39096                                 music continues][all
42185    looks away][to Jaysohn] You good? There's a lo...
42789    comforts][Aabria cackles] She's asleep! She's ...
46539                             his arms][Aabria laughs]
46751            chuckling][Brennan sighs in exasperation]
47168                               music][die taps table]
51278                             chittering][tape clicks,
56714                                      laughing][group
Name: variable, dtype: object

In [187]:
d20_df['variable'] = d20_df['variable'].str.replace(r'.*\](.*)\[.*', r'\1', regex=True).astype('str')

In [188]:
#d20_df[d20_df['variable'].str.contains('^\[.*\].*')]

In [189]:
lead_sq = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^\[.*\].*')])

In [190]:
lead_sq['variable'] = lead_sq['variable'].str.replace(r'^(\[.*\]).*', r'\1', regex=True).astype('str')
lead_sq = pd.Series(lead_sq['variable'])
lead_sq

1251                        [to Andhera]
1389                   [in normal voice]
3031                            [laughs]
4516                    [Emily laughing]
5086                            [laughs]
                      ...               
60567                      [chef’s kiss]
65071    [holds his arms out behind him]
65529               [holds up the phone]
66634                          [yelling]
68548                         [stutters]
Name: variable, Length: 71, dtype: object

In [191]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^\[.*\](.*)', r'\1', regex=True).astype('str')

In [192]:
#d20_df[d20_df['variable'].str.contains('.*\[.*\]$')]

In [193]:
trail_sq = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\]$')])

In [194]:
trail_sq['variable'] = trail_sq['variable'].str.replace(r'.*(\[.*\])$', r'\1', regex=True).astype('str')
trail_sq = pd.Series(trail_sq['variable'])
trail_sq

418                      [stammering]
540          [he winks, a bell dings]
629           [gasping effort sounds]
689       [Squak gasping frantically]
709                           [sighs]
                     ...             
67019            [nods encouragingly]
67313    [Evan and Jammer chest bump]
67807             [holding his token]
68562                  [silent pause]
69363    [looks at Lou questioningly]
Name: variable, Length: 811, dtype: object

In [195]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\]$', r'\1', regex=True).astype('str')

In [196]:
#d20_df[d20_df['variable'].str.contains('.*\[.*\].*')]

In [197]:
mid_sq = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*\].*')])

In [198]:
mid_sq['variable'] = mid_sq['variable'].str.replace(r'.*(\[.*\]).*', r'\1', regex=True).astype('str')
mid_sq = pd.Series(mid_sq['variable'])
mid_sq

199                  [laughing]
371                   [singing]
472                    [grunts]
565      [in Major Hob’s voice]
570           [looks at camera]
                  ...          
70446               [intensely]
70483          [in realization]
70529      [in a growing chant]
70832               [deep sigh]
70871        [mock whiny voice]
Name: variable, Length: 1749, dtype: object

In [199]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*\](.*)', r'\1\2', regex=True).astype('str')

In [200]:
#d20_df[d20_df['variable'].str.contains('.*\[.*')]

In [201]:
print(d20_df.variable[1414])
print(d20_df.variable[4361])
print(d20_df.variable[68391])
print(d20_df.variable[67141])

It is, as ever, the Gloaming here in the Unseelie Court. [thunderstorm
Aabria: And then just hops in a circle, and then kind of... [caws
in. I'll be playing Evan Kelmp. Thanks for being here. [really savouring
Sam: Oh, that's perfect. Yeah, just put it right on the page. [makes


Well isn't this another interesting quirk. Sometimes our bracket phrases are split between lines. 
Okay! No problem, we'll split it as-is per line.

In [202]:
fhalf_sq = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\[.*')])

In [203]:
fhalf_sq['variable'] = fhalf_sq['variable'].str.replace(r'.*(\[.*)', r'\1', regex=True).astype('str')
fhalf_sq = pd.Series(fhalf_sq['variable'])
fhalf_sq

1414                                         [thunderstorm
2375     [bird squawking and feather rustling as an ima...
4361                                                 [caws
5265                                               [clears
5676                                                   [to
                               ...                        
66995                              [holds his wand at Sam,
67141                                               [makes
68391                                    [really savouring
68810                                               [glass
69593                                              [gasps,
Name: variable, Length: 180, dtype: object

In [204]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\[.*', r'\1', regex=True).astype('str')

In [205]:
#d20_df[d20_df['variable'].str.contains('.*\].*')]

In [206]:
bhalf_sq = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\].*')])

In [207]:
bhalf_sq['variable'] = bhalf_sq['variable'].str.replace(r'(.*\]).*', r'\1', regex=True).astype('str')
bhalf_sq = pd.Series(bhalf_sq['variable'])
bhalf_sq

1415                                          rumbling]
1928                                            throat]
2376     a crowned pigeon is edited into Aabria’s hand]
2426                                      emphatically]
4362                                           sweetly]
                              ...                      
66996                    which she tentatively mirrors]
67142            peace signs and sticks out her tongue]
68392                                    the syllables]
68811                                   breaking sound]
69594                                    turns to Evan]
Name: variable, Length: 180, dtype: object

In [208]:
d20_df['variable'] = d20_df['variable'].str.replace(r'.*\](.*)', r'\1', regex=True).astype('str')

In [209]:
#d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*\(.*\).*')]

In [210]:
trip_round = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*\(.*\).*')])

In [211]:
trip_round['variable'] = trip_round['variable'].str.replace(r'.*(\(.*\)).*(\(.*\)).*(\(.*\)).*', r'\1\2\3', regex=True).astype('str')
trip = pd.Series(trip_round['variable'])
trip

30038       (Spanish accent)(group laughs)(radio crackles)
34540           (group laughs)(Jasper sighs)(group laughs)
34812    (slow music)(Izzy claps)(Chimes for a natural 20)
34902    (breathy)(Brennan groans breathily)(group laug...
34906                         (slap)(groans)(group laughs)
Name: variable, dtype: object

In [212]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\(.*\)(.*)\(.*\)(.*)', r'\1\2\3\4', regex=True).astype('str')

In [213]:
trip_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*\(.*$')])

In [214]:
trip_cut['variable'] = trip_cut['variable'].str.replace(r'.*(\(.*\)).*(\(.*\)).*(\(.*)$', r'\1\2\3', regex=True).astype('str')
trip_cut = pd.Series(trip_cut['variable'])
trip_cut

30233                (impressed)(Brennan chortles)(Rashawn
35343    (slow music)(heart beats)(group laughs as Izzy...
53478             (Erika screams)(Aabria screams)(everyone
Name: variable, dtype: object

In [215]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\(.*\)(.*)\(.*$', r'\1\2\3', regex=True).astype('str')

In [216]:
final = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\).*\(.*\) \(.*$')])

In [217]:
final['variable'] = final['variable'].str.replace(r'(.*\)).*(\(.*\)) (\(.*$)', r'\1\2\3', regex=True).astype('str')
final = pd.Series(final['variable'])
final

34498    out)(inhales)(big
Name: variable, dtype: object

In [218]:
d20_df['variable'] = d20_df['variable'].str.replace(r'.*\)(.*)\(.*\) \(.*$', r'\1', regex=True).astype('str')

In [219]:
#d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*')]

In [220]:
dub_round = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*\).*')])

In [221]:
dub_round['variable'] = dub_round['variable'].str.replace(r'.*(\(.*\)).*(\(.*\)).*', r'\1\2', regex=True).astype('str')
dub = pd.Series(dub_round['variable'])
dub

23912                         (Aabria laughs)(Rick laughs)
23954          (all laugh)(speakers talk over one another)
23967                       (Denise laughs)(Aabria laughs)
24016                         (Aabria laughs)(Rick laughs)
24040                       (Aabria laughs)(Denise laughs)
24286                                  (all laugh)(laughs)
24504                       (Rick chuckles)(Aabria laughs)
24985                               (laughs)(group laughs)
25886                                (laughter)(crosstalk)
25906                           (group laughing)(cheering)
29825                                   (choked)(in shock)
30171                        (Erika laughs)(twinkly music)
30238      (as Tara)(Erika cackles at Jasper's impression)
30245                  (As Tara)(Brennan and Aabria laugh)
30955                       (lightning cracks)(dice rolls)
31042                      (Aabria laughs)(dramatic music)
31650       (group cheers a critical roll)(dramatic musi

In [222]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\(.*\)(.*)', r'\1\2\3', regex=True).astype('str')

In [223]:
dub_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^.*\).*\(.*\).*')])

In [224]:
dub_cut['variable'] = dub_cut['variable'].str.replace(r'^(.*\)).*(\(.*\)).*', r'\1\2', regex=True).astype('str')
dub_cut = pd.Series(dub_cut['variable'])
dub_cut

24491                                 laughs)(Rick laughs)
30562                        anticipation)(dramatic music)
30564            crumbling and collapsing)(dramatic music)
30623    delight)(Various exclamations of awe and disgust)
31559            excitement)(Players cheering for Rashawn)
Name: variable, dtype: object

In [225]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^.*\)(.*)\(.*\)(.*)', r'\1\2', regex=True).astype('str')

In [226]:
bdub_cut = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*\(.*$')])

In [227]:
bdub_cut['variable'] = bdub_cut['variable'].str.replace(r'.*(\(.*\)).*(\(.*)$', r'\1\2', regex=True).astype('str')
bdub_cut = pd.Series(bdub_cut['variable'])
bdub_cut

26752                               (group laughing)(group
29965                   (congested)(group laughing at this
30622    (all scream in horror and disgust)(Erika laugh...
30712                    (players laughing)(Erika imitates
30919                    (Jasper)(Other players grimace at
34497                        (deep breath in)(tight breath
35703                              (players groaning)(bear
35790                                       (chirps)(group
36043                    (dramatic electronic music)(group
36270                                 (group laughs)(group
36413                (dice rattling)(slow electronic music
40631                                  (under breath)(dice
Name: variable, dtype: object

In [228]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)\(.*$', r'\1\2', regex=True).astype('str')

In [229]:
#d20_df[d20_df['variable'].str.contains('^\(.*\)$')]

In [230]:
r_bracks = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^\(.*\)$')])

In [231]:
r_bracks['variable'] = r_bracks['variable'].str.replace(r'.*(\(.*\)).*(\(.*\)).*', r'\1\2', regex=True).astype('str')
rbrk = pd.Series(r_bracks['variable'])
rbrk

23689               (all cheering)
23776              (Rick chuckles)
23962                  (all laugh)
23985                     (laughs)
24438               (Ebony laughs)
                   ...            
74790    (talking over each other)
74827               (all laughing)
74855                  (all laugh)
74914                     (laughs)
75691            (distant banging)
Name: variable, Length: 321, dtype: object

In [232]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^\(.*\)$', r'', regex=True).astype('str')

In [233]:
#d20_df[d20_df['variable'].str.contains('.*\(.*\)$')]

In [234]:
r_trail = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\)$')])

In [235]:
r_trail['variable'] = r_trail['variable'].str.replace(r'.*(\(.*\))$', r'\1', regex=True).astype('str')
r_trail = pd.Series(r_trail['variable'])
r_trail

23831      (chuckles)
23839        (laughs)
23846        (laughs)
23849        (laughs)
23976    (indistinct)
             ...     
75504        (laughs)
75520        (laughs)
75587        (laughs)
75796        (laughs)
75888        (laughs)
Name: variable, Length: 549, dtype: object

In [236]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)$', r'\1', regex=True).astype('str')

In [237]:
#d20_df[d20_df['variable'].str.contains('^\(.*\).*')]

In [238]:
r_lead = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^\(.*\).*')])

In [239]:
r_lead['variable'] = r_lead['variable'].str.replace(r'^(\(.*\)).*', r'\1', regex=True).astype('str')
r_lead = pd.Series(r_lead['variable'])
r_lead

25120                                       (group laughs)
26913                                           (laughing)
26948                                     (group laughing)
26949                                   (Brennan laughing)
27036                                     (Emily laughing)
27115                                           (applause)
29358                                         (Omar gasps)
30761                                (echoing dual voiced)
31682                       (Rashawn and Jasper slap five)
32267                                            (she/her)
33001                                            (she/her)
35336                                      (flesh tearing)
35350                                        (heart beats)
35401                                       (group laughs)
35403                                           (stammers)
36567                                             (laughs)
36603                                       (group laugh

In [240]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^\(.*\)(.*)', r'\1', regex=True).astype('str')

In [241]:
#d20_df[d20_df['variable'].str.contains('.*\(.*\).*')]

In [242]:
mid_r = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*\).*')])

In [243]:
mid_r['variable'] = mid_r['variable'].str.replace(r'.*(\(.*\)).*', r'\1', regex=True).astype('str')
mid_r = pd.Series(mid_r['variable'])
mid_r

23553    (Aabria coughs)
23726         (laughing)
23797    (Rick chuckles)
23922           (laughs)
24001           (laughs)
              ...       
75392           (laughs)
75573           (laughs)
75586           (grunts)
75867           (laughs)
75979           (laughs)
Name: variable, Length: 501, dtype: object

In [244]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*\)(.*)', r'\1\2', regex=True).astype('str')

In [245]:
#d20_df[d20_df['variable'].str.contains('\(')]

In [246]:
fhalf_r = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\(.*')])

In [247]:
fhalf_r['variable'] = fhalf_r['variable'].str.replace(r'.*(\(.*)', r'\1', regex=True).astype('str')
fhalf_r = pd.Series(fhalf_r['variable'])
fhalf_r

24351                (all
24490              (Ebony
24610              (group
26876              (Emily
27023              (group
               ...       
64925           (dramatic
65962             (clears
66811    (cute laugh that
73473               (hand
74871         (mimics gun
Name: variable, Length: 78, dtype: object

In [248]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)\(.*', r'\1', regex=True).astype('str')

In [249]:
#d20_df[d20_df['variable'].str.contains('\)')]

In [250]:
bhalf_r = pd.DataFrame(d20_df[d20_df['variable'].str.contains('.*\).*')])

In [251]:
bhalf_r['variable'] = bhalf_r['variable'].str.replace(r'(.*\)).*', r'\1', regex=True).astype('str')
bhalf_r = pd.Series(bhalf_r['variable'])
bhalf_r

24352                                  laugh)
24611                                 laughs)
26753                               laughing)
26877                               laughing)
27024                               laughing)
                         ...                 
65963                                 throat)
66812    turns into a classic witch’s cackle)
73474                                  smack)
74155                       the horn gesture)
74872                                  noise)
Name: variable, Length: 85, dtype: object

In [252]:
d20_df['variable'] = d20_df['variable'].str.replace(r'.*\)(.*)', r'\1', regex=True).astype('str')

In [253]:
sounds = [small, mix_b, rsq, sq, three, three_cut, odd, two, two_cut, lead_sq, trail_sq, mid_sq, fhalf_sq, bhalf_sq, trip, trip_cut, final, dub, dub_cut, bdub_cut, rbrk, r_trail, odd_r, r_lead, mid_r, fhalf_r, bhalf_r]

In [254]:
noises = pd.concat(sounds)

In [255]:
duplicates = noises.index[noises.index.duplicated(keep=False)]

In [256]:
print(duplicates)

Int64Index([], dtype='int64')


In [257]:
d20_df['nonspeech'] = noises

In [258]:
d20_df['nonspeech'] = d20_df['nonspeech'].fillna('')
d20_df['inaudible_speech'] = ''

In [259]:
d20_df.head()

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
0,10,"Aabria: To the churlish Captain K.P. Hob, vene...",10,ACOFF,,
1,11,"Goblin Court. To the master of ceremonies, Del...",10,ACOFF,,
2,12,"the Court of Wonder. To BINX Choppley, sole su...",10,ACOFF,,
3,13,"Craft. To the tenebrous Prince Andhera, scion ...",10,ACOFF,,
4,14,"to the notorious Lords of the Wing, Lady Chirp...",10,ACOFF,,


So there's our inaudible speech and nonspeech columns! There are actually no inaudible markings in the D20 data but to be consistent with the other DF I've added the column anyway. I'll rearrange columns once I've finished all my regex work

### Creating the Name column

This became quite the process as I worked on it, and I'll elaborate on the step by steps below as I organize them, but the basic premise is the following
- identify and correct any problem lines
    - these were discovered later in the process, and needed to be fixed before the processing the names column
- find and isolate all instances of names, which look like "Name:"
- assign names to new column and use ffill to populate following rows for the correct speaker

In [260]:
#first here are some instances of editor notes, we'll just get rid of those
d20_df[d20_df['variable'].str.contains('Note:')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
41413,36,Ed. Note: This is a reference to D20 season Mi...,5,Burrow's End,,
45838,1451,Note: “bless their heart” is a very specific S...,6,Burrow's End,,


In [261]:
d20_df = d20_df.drop(45838)
d20_df = d20_df.drop(41413)

In [262]:
#there was an instance of lines being oddly coded, so I've split them correctly
#I do think in this case it was an interruption so I coded Aabria's line to end with a hyphen
d20_df[d20_df['variable'].str.contains('.*Erika:.*Team in$')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
29987,280,Aabria: You're getting sort of chatter which i...,10,Burrow's End,,


In [263]:
d20_df.variable[29987]

"Aabria: You're getting sort of chatter which is likErika:\xa0“Bravo Team in"

In [264]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(.*)Erika:.*Team in$', r'\1-', regex=True).astype('str')

In [265]:
d20_df[d20_df['variable'].str.contains('place. We\'re on standby for orders. Everything is looking good.” You\'re')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
29988,281,place. We're on standby for orders. Everything...,10,Burrow's End,,


In [266]:
d20_df['variable'] = d20_df['variable'].str.replace(r'(place. We\'re on standby for orders. Everything is looking good.” You\'re)', r'Erika: “Bravo Team in \1', regex=True).astype('str')

In [267]:
d20_df[d20_df['variable'].str.contains('Dexterity:')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
32239,125,19. Strength: 13. Dexterity: 19. Constitution:...,1,Burrow's End,,
32268,154,class: 14. Strength: 9. Dexterity: 16. Constit...,1,Burrow's End,,
32584,470,Wanderer / Level 4. Armor class: 14. Strength:...,1,Burrow's End,,
32690,576,Armor class: 13. Strength: 14. Dexterity: 16. ...,1,Burrow's End,,
33003,889,Dexterity: 15. Constitution: 19. Intelligence:...,1,Burrow's End,,
33015,901,Level 4. Armor class: 18. Strength: 13. Dexter...,1,Burrow's End,,


In [268]:
#these are all on screen texts featuring character stats
d20_df = d20_df.drop(33001)
d20_df = d20_df.drop(33002)
d20_df = d20_df.drop(33003)
d20_df = d20_df.drop(33004)

d20_df = d20_df.drop(32237)
d20_df = d20_df.drop(32238)
d20_df = d20_df.drop(32239)
d20_df = d20_df.drop(32240)

d20_df = d20_df.drop(32266)
d20_df = d20_df.drop(32267)
d20_df = d20_df.drop(32268)
d20_df = d20_df.drop(32269)

d20_df = d20_df.drop(32582)
d20_df = d20_df.drop(32583)
d20_df = d20_df.drop(32584)
d20_df = d20_df.drop(32585)

d20_df = d20_df.drop(32688)
d20_df = d20_df.drop(32689)
d20_df = d20_df.drop(32690)
d20_df = d20_df.drop(32691)

d20_df = d20_df.drop(33013)
d20_df = d20_df.drop(33014)
d20_df = d20_df.drop(33015)
d20_df = d20_df.drop(33016)

#various on screen texts I've spotted in my name column building

d20_df = d20_df.drop(11850)
d20_df = d20_df.drop(29955)
d20_df = d20_df.drop(49395)
d20_df = d20_df.drop(42729)
d20_df = d20_df.drop(47793)

In [269]:
#these following lines were being captured in the name column captures, so I've replaced the : with a -
d20_df.variable[6240]

'normal."  What can I start about Gwyndolin: nothing.'

In [270]:
d20_df[d20_df['variable'].str.contains('normal."  What can I start about Gwyndolin: nothing\.')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
6240,239,"normal."" What can I start about Gwyndolin: no...",3,ACOFF,[group laughing],


In [271]:
d20_df['variable'] = d20_df['variable'].str.replace(r'normal."  What can I start about Gwyndolin: nothing\.', r'normal."  What can I start about Gwyndolin- nothing\.', regex=True).astype('str')

In [272]:
d20_df.variable[32133]

"Iyengar. I will be your Dungeon Master for this: Burrow's End."

In [273]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Party:\xa0Stoatal Recall\.\"', r'Party- Stoatal Recall."', regex=True).astype('str')

In [274]:
d20_df[d20_df['variable'].str.contains('Iyengar. I will be your Dungeon Master for this: Burrow\'s End.')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
32133,19,Iyengar. I will be your Dungeon Master for thi...,1,Burrow's End,,


In [275]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Iyengar. I will be your Dungeon Master for this: Burrow\'s End.', r"Iyengar. I will be your Dungeon Master for this- Burrow's End.", regex=True).astype('str')

In [276]:
#same with these because of the times. I've made : into a ; and will revert later on
d20_df[d20_df['variable'].str.contains('Side where Whitney Jammer… Where would this owl find you at like, 3:00')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
57109,81,Side where Whitney Jammer… Where would this ow...,1,Misfits and Magic,,


In [277]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Side where Whitney Jammer… Where would this owl find you at like, 3:00', r"Side where Whitney Jammer… Where would this owl find you at like, 3 o'clock", regex=True).astype('str')

In [278]:
d20_df[d20_df['variable'].str.contains('As of course something that the camera sees that you do not: the scrape')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
69739,1379,As of course something that the camera sees th...,5,Misfits and Magic,,


In [279]:
d20_df['variable'] = d20_df['variable'].str.replace(r'As of course something that the camera sees that you do not: the scrape', r"As of course something that the camera sees that you do not- the scrape", regex=True).astype('str')

In [280]:
d20_df.variable[51257]

'Accident in Reactor Charlie. At 11:20, all plant personnel were notified'

In [281]:
d20_df[d20_df['variable'].str.contains('Accident in Reactor Charlie. At 11:20, all plant personnel were notified')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
51257,1632,"Accident in Reactor Charlie. At 11:20, all pla...",8,Burrow's End,,


In [282]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Accident in Reactor Charlie. At 11:20, all plant personnel were notified', r"Accident in Reactor Charlie. At 11;20, all plant personnel were notified", regex=True).astype('str')

In [283]:
d20_df[d20_df['variable'].str.contains('Dr. Wenabocker. It is 4/21/62 at 8:11. The LOC logged on the 18th has')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
44089,2712,Dr. Wenabocker. It is 4/21/62 at 8:11. The LOC...,5,Burrow's End,,


In [284]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Dr. Wenabocker. It is 4/21/62 at 8:11. The LOC logged on the 18th has', r"Dr. Wenabocker. It is 4/21/62 at 8;11. The LOC logged on the 18th has", regex=True).astype('str')

In [285]:
d20_df[d20_df['variable'].str.contains('It is 2:')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
51271,1646,"It is 2:22, two hours post order of a full eva...",8,Burrow's End,,


In [286]:
d20_df['variable'] = d20_df['variable'].str.replace(r'It is 2:22, two hours post order of a full evacuation of Peace Plant by', r"It is 2;22, two hours post order of a full evacuation of Peace Plant by", regex=True).astype('str')

In [287]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Part of Evan:\xa0I know exactly who I\'m talking to, and your time hasn\'t', r"Evan: I know exactly who I'm talking to, and your time hasn't", regex=True).astype('str')

In [288]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Hello, this is Dr\. Wenabocker\. It is 4/21/62 at 8:11\. ', r"Hello, this is Dr. Wenabocker. It is 4/21/62 at 8;11. ", regex=True).astype('str')

This was a case I saw a few of as well. I suspect the editors of the text files maybe did a find and replace somehow, because in the cases I saw it was always an overlap at one letter: Member/Rashawn overlapping at the R, 

In [289]:
d20_df[d20_df['variable'].str.contains('Crew memb')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech
54505,28,Crew membeRashawn: Chipmunks and the PCs.,2,Burrow's End AP,(offstage),


In [290]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Crew membeRashawn:  Chipmunks and the PCs\.', r"Crew member:  Chipmunks and the PCs.", regex=True).astype('str')

In [291]:
d20_df['variable'] = d20_df['variable'].str.replace(r'JaspeRashawn', r"Jasper", regex=True).astype('str')

In [292]:
d20_df['variable'] = d20_df['variable'].str.replace(r'PhoebErika', r"Phoebe", regex=True).astype('str')

In this instance, the speech was tagged with a ? - I confirmed it was Evan speaking and corrected the lines in question

In [293]:
d20_df['variable'] = d20_df['variable'].str.replace(r'Evan\?:(.*)', r"Evan:\1", regex=True).astype('str')

These are fan created transcriptions and they're a good team doing good work, but they're not perfect! There were a bunch of lines that had partial names or no character names. Between the website and skimming the episodes, I confirmed who was speaking at the times I found this, and corrected those instances to be correctly associated with the name it should be

In [294]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^Tu:(.*)', r"Tula:\1", regex=True).astype('str')

In [295]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^ra:(.*)', r"Rashawn:\1", regex=True).astype('str')

In [296]:
d20_df['variable'] = d20_df['variable'].str.replace(r'q:(.*)', r'Q:\1', regex=True).astype('str')

In [297]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^a:(.*)', r'Aabria:\1', regex=True).astype('str')

In [298]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^A:(.*)', r'Aabria:\1', regex=True).astype('str')

In [299]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^c:(.*)', r'Carlos:\1', regex=True).astype('str')

In [300]:
d20_df['variable'] = d20_df['variable'].str.replace(r': It\'s really crazy, so just try to make sure that Jaysohn doesn\'t get', r"Tula: It's really crazy, so just try to make sure that Jaysohn doesn't get", regex=True).astype('str')

In [301]:
d20_df['variable'] = d20_df['variable'].str.replace(r': With that final glow, the entire pool seems to get more vibrant\.', r"Aabria: With that final glow, the entire pool seems to get more vibrant.", regex=True).astype('str')

In [302]:
d20_df['variable'] = d20_df['variable'].str.replace(r': Swinging twice, regular attacks, on this dude\. There\'s no shenanigans', r"Brennan: Swinging twice, regular attacks, on this dude. There's no shenanigans", regex=True).astype('str')

In [303]:
d20_df['variable'] = d20_df['variable'].str.replace(r': As they stand back up, they just put their foot over your throat and', r"Aabria: As they stand back up, they just put their foot over your throat and", regex=True).astype('str')

In [304]:
d20_df['variable'] = d20_df['variable'].str.replace(r': Son of a-', r"Aabria: Son of a-", regex=True).astype('str')

In [305]:
d20_df['variable'] = d20_df['variable'].str.replace(r':  My dress glows\.', r"Model:  My dress glows.", regex=True).astype('str')

Some lines also had Name(space):, possibly a space created by the removal of a (sound) up above. I'll get rid of those here so again they'll be properly captured in the name processing below. 

In [306]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Lou\ :', r'Lou:', regex=True).astype('str')

In [307]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Jasper\ :', r'Jasper:', regex=True).astype('str')

In [308]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Aabria\ :', r'Aabria:', regex=True).astype('str')

In [309]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Brennan\ :', r'Brennan:', regex=True).astype('str')

In [310]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Izzy\ :', r'Izzy:', regex=True).astype('str')

In [311]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Sybil\ :', r'Sybil:', regex=True).astype('str')

In [312]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Jaysoh\ :', r'Jaysohn:', regex=True).astype('str')

In [313]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'tErika:', r'Erika:', regex=True).astype('str')

In [314]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'bErika:', r'Erika:', regex=True).astype('str')

In [315]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Brenna:', r'Brennan:', regex=True).astype('str')

In [316]:
 d20_df['variable'] = d20_df['variable'].str.replace(r'Brennna:', r'Brennan:', regex=True).astype('str')

#### Compile names

Now that those are treated, let's capture those and add them into a series of their own

In [317]:
singlename = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^[A-Z][a-z]*:.*')])

In [318]:
singlename['variable'] = singlename['variable'].str.replace(r'^([A-Z][a-z]*:).*', r'\1', regex=True).astype('str')
singlename = pd.Series(singlename['variable'])
singlename

0        Aabria:
10          All:
11       Aabria:
13        Emily:
14       Aabria:
          ...   
76005    Aabria:
76006     Orion:
76007    Aabria:
76008     David:
76010    Aabria:
Name: variable, Length: 43227, dtype: object

In [319]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^[A-Z][a-z]*:(.*)', r'\1', regex=True).astype('str')

In [320]:
#d20_df[d20_df['variable'].str.contains('^[A-Z][a-z]*.*[A-Z]*[a-z]*:.*')]

In [321]:
longnames = pd.DataFrame(d20_df[d20_df['variable'].str.contains('^[A-Z][a-z]*.*[A-Z]*[a-z]*:.*')])

In [322]:
longnames['variable'] = longnames['variable'].str.replace(r'^([A-Z][a-z]*.*[A-Z]*[a-z]*:).*', r'\1', regex=True).astype('str')
longnames = pd.Series(longnames['variable'])
longnames

183                           BINX:
213                      Major Hob:
235                    Lou & Emily:
254                      Major Hob:
257                      Major Hob:
                    ...            
73361              Lou and Brennan:
73459          Aabria and Danielle:
73837           Aabria and Brennan:
74115              Lou and Brennan:
74772    Someone behind the camera:
Name: variable, Length: 1559, dtype: object

In [323]:
d20_df['variable'] = d20_df['variable'].str.replace(r'^[A-Z][a-z]*.*[A-Z]*[a-z]*:(.*)', r'\1', regex=True).astype('str')

In [324]:
shortnames = pd.DataFrame(d20_df[d20_df['variable'].str.contains('[A-Z][a-z]*:.*')])

In [325]:
shortnames['variable'] = shortnames['variable'].str.replace(r'([A-Z][a-z]*:).*', r'\1', regex=True).astype('str')
shortnames = pd.Series(shortnames['variable'])
shortnames

75               Suntar:
76                Chirp:
77               Suntar:
78                Chirp:
79               Suntar:
              ...       
60154              Evan:
60511             Digby:
63014              Evan:
68850               Sam:
70777             Oggle:
Name: variable, Length: 423, dtype: object

In [326]:
d20_df['variable'] = d20_df['variable'].str.replace(r'[A-Z][a-z]*:(.*)', r'\1', regex=True).astype('str')

In [327]:
nameslist = [singlename, longnames, shortnames]

In [328]:
names = pd.concat(nameslist) #concat the series together

In [329]:
namedupes = names.index[names.index.duplicated(keep=False)] #confirm there are no duplicate index values

In [330]:
print(namedupes)

Int64Index([], dtype='int64')


In [331]:
d20_df['name'] = names

Now fill in the following lines with the name above until it comes upon a new value.

In [332]:
d20_df['name']= d20_df['name'].mask(d20_df['name']=='NaN', None).ffill()
d20_df

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name
0,10,"To the churlish Captain K.P. Hob, venerated s...",10,ACOFF,,,Aabria:
1,11,"Goblin Court. To the master of ceremonies, Del...",10,ACOFF,,,Aabria:
2,12,"the Court of Wonder. To BINX Choppley, sole su...",10,ACOFF,,,Aabria:
3,13,"Craft. To the tenebrous Prince Andhera, scion ...",10,ACOFF,,,Aabria:
4,14,"to the notorious Lords of the Wing, Lady Chirp...",10,ACOFF,,,Aabria:
...,...,...,...,...,...,...,...
76015,1244,your hard work. This is literally what we mean...,5,Misfits and Magic AP,,,Aabria:
76016,1245,"collaborative storytelling, We all built this ...",5,Misfits and Magic AP,,,Aabria:
76017,1246,"an honor and a privilege to get to build ""Misf...",5,Misfits and Magic AP,,,Aabria:
76018,1247,"it right one time, thank God.",5,Misfits and Magic AP,,,Aabria:


We know from far earlier in this notebook that Aabria's opening speech goes from index 0-9 where she says hello and is responded to in index 10 by "All", so let's confirm that the name values reflect this

In [333]:
print(d20_df.name[9])
print(d20_df.name[10])  #great!!

Aabria:
All:


Now let's check out some basic counts

In [334]:
d20_df['name'].value_counts()

Aabria:                 20145
Brennan:                 7050
Lou:                     3281
Erika:                   2488
Oscar:                   2098
                        ...  
Cater:                      1
Ava :                       1
Olliver :                   1
Thoen:                      1
Brennan and Siobhan:        1
Name: name, Length: 303, dtype: int64

In [335]:
d20_df['name'] = d20_df['name'].str.replace(r'(.*):', r'\1', regex=True).astype('str') #get rid of those trailing colons

In [336]:
d20_df['name'].value_counts()

Aabria                 20145
Brennan                 7050
Lou                     3281
Erika                   2488
Oscar                   2098
                       ...  
Cater                      1
Ava                        1
Olliver                    1
Thoen                      1
Brennan and Siobhan        1
Name: name, Length: 303, dtype: int64

#### Correcting typo errors

While working on the player column below, I found quite a number of misspellings of names. I'm correcting them here so that when it comes time to compile and replace character names with player names, we'll have a consistent starting point.

In [337]:
d20_df['name'] = d20_df['name'].str.replace(r'Erka', r'Erika', regex=True).astype('str')

In [338]:
d20_df['name'] = d20_df['name'].str.replace(r'Sureana', r"Surena", regex=True).astype('str')

In [339]:
d20_df['name'] = d20_df['name'].str.replace(r'Aasbria', r'Aabria', regex=True).astype('str')

In [340]:
d20_df['name'] = d20_df['name'].str.replace(r'eRashawn', r'Rashawn', regex=True).astype('str')

In [341]:
d20_df['name'] = d20_df['name'].str.replace(r'Brenan', r'Brennan', regex=True).astype('str')

In [342]:
d20_df['name'] = d20_df['name'].str.replace(r'Thoen', r'Thorn', regex=True).astype('str')

In [343]:
d20_df['name'] = d20_df['name'].str.replace(r'^Bren$', r'Brennan', regex=True).astype('str')

In [344]:
d20_df['name'] = d20_df['name'].str.replace(r'Rashan', r'Rashawn', regex=True).astype('str')

In [345]:
d20_df['name'] = d20_df['name'].str.replace(r'IZzy', r'Izzy', regex=True).astype('str')

In [346]:
d20_df['name'] = d20_df['name'].str.replace(r'Jayasohn', r'Jaysohn', regex=True).astype('str')

In [347]:
 d20_df['name'] = d20_df['name'].str.replace(r'Arabia', r'Aabria', regex=True).astype('str')

In [348]:
 d20_df['name'] = d20_df['name'].str.replace(r'Dammer', r'Jammer', regex=True).astype('str')

In [349]:
d20_df['name'] = d20_df['name'].str.replace(r'Yula', r'Tula', regex=True).astype('str')

In [350]:
d20_df['name'] = d20_df['name'].str.replace(r'Jayson', r'Jaysohn', regex=True).astype('str')

In [351]:
d20_df['name'] = d20_df['name'].str.replace(r'^Boodl$', r'Boodle', regex=True).astype('str')

In [352]:
 d20_df['name'] = d20_df['name'].str.replace(r'Grablaba', r'Grabalba', regex=True).astype('str')

In [353]:
 d20_df['name'] = d20_df['name'].str.replace(r'Aabrua', r'Aabria', regex=True).astype('str')

In [354]:
 d20_df['name'] = d20_df['name'].str.replace(r'Digby', r'Digsby', regex=True).astype('str')

In [355]:
 d20_df['name'] = d20_df['name'].str.replace(r'Mela', r'Mila', regex=True).astype('str')

In [356]:
 d20_df['name'] = d20_df['name'].str.replace(r'^Lil$', r'Lila', regex=True).astype('str')

In [357]:
 d20_df['name'] = d20_df['name'].str.replace(r'^Khan$', r'Khanh', regex=True).astype('str')

In [358]:
 d20_df['name'] = d20_df['name'].str.replace(r'Cater', r'Carter', regex=True).astype('str')

In [359]:
 d20_df['name'] = d20_df['name'].str.replace(r'AlLila', r'All', regex=True).astype('str')

In [360]:
 d20_df['name'] = d20_df['name'].str.replace(r'Bennet$', r'Bennett', regex=True).astype('str')

In [361]:
 d20_df['name'] = d20_df['name'].str.replace(r'oLila', r'Bennett', regex=True).astype('str')

In [362]:
 d20_df['name'] = d20_df['name'].str.replace(r'Whitner', r'Jammer', regex=True).astype('str')

In [363]:
 d20_df['name'] = d20_df['name'].str.replace(r'Erika\ $', r'Erika', regex=True).astype('str')

### Player Name column

Since these transcriptions use character names to differentiate between when a character is speaking vs when the player is speaking, that's not very helpful in getting us value information on how much different people are speaking. 

The process for the player name column is:
- Identify the unique values in the newly cloned player column
- Replace values as appropriate to the player associated with the character name
    - all assignments were confirmed using the fan wiki and scanning episodes online

In [364]:
d20_df['player']=d20_df['name']
d20_df.head()

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
0,10,"To the churlish Captain K.P. Hob, venerated s...",10,ACOFF,,,Aabria,Aabria
1,11,"Goblin Court. To the master of ceremonies, Del...",10,ACOFF,,,Aabria,Aabria
2,12,"the Court of Wonder. To BINX Choppley, sole su...",10,ACOFF,,,Aabria,Aabria
3,13,"Craft. To the tenebrous Prince Andhera, scion ...",10,ACOFF,,,Aabria,Aabria
4,14,"to the notorious Lords of the Wing, Lady Chirp...",10,ACOFF,,,Aabria,Aabria


In [365]:
print(d20_df['player'].unique())

['Aabria' 'All' 'Emily' 'Lou' 'Omar' 'Oscar' 'Brennan'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Suntar'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Chirp' 'Suntar' 'Apollo' 'Squak'
 'Surena' 'Chirp' 'BINX' 'Major Hob' 'Lou & Emily' 'Rue' 'Andhera'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Andhera'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0BINX'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Apollo'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Squak'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Major Hob'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Rue'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Wuvvy' 'Major Hob & Squak'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Jeremy Renner' 'Jeremy Renner' 'Advisor'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Advisor' 'Wuvvy' 'Grandfather'
 'Lou and Oscar' 'Mika' '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Gwyndolin'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Captain Hob' 'Captain Hob'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Goblin King' 'Goblin King'
 'Injured Goblin' 'Viscountess Grabalba' 'Gwyndolin' 'Bird' 'Crowd'
 '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Onlooker' 'Satisfied Perso

In [366]:
d20_df[d20_df['name'].str.contains('^K$')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
66874,1764,Yeah.,4,Misfits and Magic,,,K,K
66908,1798,Good!,4,Misfits and Magic,,,K,K
66913,1803,Whoo!,4,Misfits and Magic,,,K,K
66927,1817,"You have blanket consent, my friend.",4,Misfits and Magic,,,K,K
66930,1820,Also I'm experimenting with they/them!,4,Misfits and Magic,,,K,K
...,...,...,...,...,...,...,...,...
71515,3155,"All right, everybody let's keep it going.",5,Misfits and Magic,,,K,K
71623,3263,"Oh, God, okay!",5,Misfits and Magic,,,K,K
71636,3276,You have a beautiful voice.,5,Misfits and Magic,,,K,K
71692,3332,Like right through the carotid.,5,Misfits and Magic,,,K,K


In [367]:
d20_df['player'] = d20_df['player'].str.replace(r'^K$', r'Erika', regex=True).astype('str')

In [368]:
d20_df['player'] = d20_df['player'].str.replace(r'Evan', r'Brennan', regex=True).astype('str')

In [369]:
d20_df['player'] = d20_df['player'].str.replace(r'Captain Hob', r'Brennan', regex=True).astype('str')

In [370]:
d20_df['player'] = d20_df['player'].str.replace(r'Jaysohn', r'Siobhan', regex=True).astype('str')

In [371]:
d20_df['player'] = d20_df['player'].str.replace(r'Tula', r'Brennan', regex=True).astype('str')

In [372]:
d20_df['player'] = d20_df['player'].str.replace(r'Jammer', r'Lou', regex=True).astype('str')

In [373]:
d20_df['player'] = d20_df['player'].str.replace(r'Lila', r'Izzy', regex=True).astype('str')

In [374]:
 d20_df['player'] = d20_df['player'].str.replace(r'Thorn', r'Jasper', regex=True).astype('str')

In [375]:
 d20_df['player'] = d20_df['player'].str.replace(r'Andhera', r'Omar', regex=True).astype('str')

In [376]:
 d20_df['player'] = d20_df['player'].str.replace(r'BINX', r'Surena', regex=True).astype('str')

In [377]:
 d20_df['player'] = d20_df['player'].str.replace(r'Chirp', r'Emily', regex=True).astype('str')

In [378]:
 d20_df['player'] = d20_df['player'].str.replace(r'Major Hob', r'Brennan', regex=True).astype('str')

In [379]:
 d20_df['player'] = d20_df['player'].str.replace(r'Jeremy Renner', r'Aabria', regex=True).astype('str')

In [380]:
 d20_df['player'] = d20_df['player'].str.replace(r'Rue', r'Oscar', regex=True).astype('str')

In [381]:
 d20_df['player'] = d20_df['player'].str.replace(r'Apollo', r'Aabria', regex=True).astype('str')

In [382]:
 d20_df['player'] = d20_df['player'].str.replace(r'Squak', r'Lou', regex=True).astype('str')

In [383]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mika', r'Aabria', regex=True).astype('str')

In [384]:
 d20_df['player'] = d20_df['player'].str.replace(r'NPC', r'Aabria', regex=True).astype('str')

In [385]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fenneck', r'Aabria', regex=True).astype('str')

In [386]:
 d20_df['player'] = d20_df['player'].str.replace(r'Suntar', r'Aabria', regex=True).astype('str')

In [387]:
 d20_df['player'] = d20_df['player'].str.replace(r'Wuvvy', r'Aabria', regex=True).astype('str')

In [388]:
 d20_df['player'] = d20_df['player'].str.replace(r'Advisor', r'Aabria', regex=True).astype('str')

In [389]:
 d20_df['player'] = d20_df['player'].str.replace(r'Grandfather', r'Aabria', regex=True).astype('str')

In [390]:
 d20_df['player'] = d20_df['player'].str.replace(r'Gwyndolin', r'Aabria', regex=True).astype('str')

In [391]:
 d20_df['player'] = d20_df['player'].str.replace(r'Goblin King', r'Aabria', regex=True).astype('str')

In [392]:
 d20_df['player'] = d20_df['player'].str.replace(r'Bird', r'Aabria', regex=True).astype('str')

In [393]:
 d20_df['player'] = d20_df['player'].str.replace(r'Crowd', r'Aabria', regex=True).astype('str')

In [394]:
 d20_df['player'] = d20_df['player'].str.replace(r'Onlooker', r'Aabria', regex=True).astype('str')

In [395]:
 d20_df['player'] = d20_df['player'].str.replace(r'Messenger', r'Aabria', regex=True).astype('str')

In [396]:
 d20_df['player'] = d20_df['player'].str.replace(r'Wannessa', r'Aabria', regex=True).astype('str')

In [397]:
 d20_df['player'] = d20_df['player'].str.replace(r'Viola', r'Rashawn', regex=True).astype('str')

In [398]:
 d20_df['player'] = d20_df['player'].str.replace(r'Ephendra', r'Aabria', regex=True).astype('str')

In [399]:
 d20_df['player'] = d20_df['player'].str.replace(r'Beatrix', r'Aabria', regex=True).astype('str')

In [400]:
 d20_df['player'] = d20_df['player'].str.replace(r'Kobold', r'Aabria', regex=True).astype('str')

In [401]:
 d20_df['player'] = d20_df['player'].str.replace(r'Prue', r'Aabria', regex=True).astype('str')

In [402]:
 d20_df['player'] = d20_df['player'].str.replace(r'Meredith', r'Aabria', regex=True).astype('str')

In [403]:
 d20_df['player'] = d20_df['player'].str.replace(r'Injured Goblin', r'Aabria', regex=True).astype('str')

In [404]:
 d20_df['player'] = d20_df['player'].str.replace(r'Viscountess Grabalba', r'Aabria', regex=True).astype('str')

In [405]:
 d20_df['player'] = d20_df['player'].str.replace(r'Satisfied Person', r'Aabria', regex=True).astype('str')

In [406]:
 d20_df['player'] = d20_df['player'].str.replace(r'Capacea', r'Aabria', regex=True).astype('str')

In [407]:
 d20_df['player'] = d20_df['player'].str.replace(r'Seelie Court', r'Aabria', regex=True).astype('str')

In [408]:
 d20_df['player'] = d20_df['player'].str.replace(r'Blemish and Boil', r'Aabria', regex=True).astype('str')

In [409]:
 d20_df['player'] = d20_df['player'].str.replace(r'Blemish/Boil', r'Aabria', regex=True).astype('str')

In [410]:
 d20_df['player'] = d20_df['player'].str.replace(r'Blemish', r'Aabria', regex=True).astype('str')

In [411]:
 d20_df['player'] = d20_df['player'].str.replace(r'Guide', r'Aabria', regex=True).astype('str')

In [412]:
 d20_df['player'] = d20_df['player'].str.replace(r'Theodore', r'Aabria', regex=True).astype('str')

In [413]:
 d20_df['player'] = d20_df['player'].str.replace(r'Titania', r'Aabria', regex=True).astype('str')

In [414]:
 d20_df['player'] = d20_df['player'].str.replace(r'Hunter', r'Aabria', regex=True).astype('str')

In [415]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sea Foam Court Member', r'Aabria', regex=True).astype('str')

In [416]:
 d20_df['player'] = d20_df['player'].str.replace(r'Gorebladder', r'Aabria', regex=True).astype('str')

In [417]:
 d20_df['player'] = d20_df['player'].str.replace(r'Spectators', r'Aabria', regex=True).astype('str')

In [418]:
 d20_df['player'] = d20_df['player'].str.replace(r'Grabalba', r'Aabria', regex=True).astype('str')

In [419]:
 d20_df['player'] = d20_df['player'].str.replace(r'Server', r'Aabria', regex=True).astype('str')

In [420]:
 d20_df['player'] = d20_df['player'].str.replace(r'Lady Boil', r'Aabria', regex=True).astype('str')

In [421]:
 d20_df['player'] = d20_df['player'].str.replace(r'Human', r'Aabria', regex=True).astype('str')

In [422]:
 d20_df['player'] = d20_df['player'].str.replace(r'Grandpa Dog', r'Aabria', regex=True).astype('str')

In [423]:
 d20_df['player'] = d20_df['player'].str.replace(r'Phoebe', r'Aabria', regex=True).astype('str')

In [424]:
 d20_df['player'] = d20_df['player'].str.replace(r'The Queen of Air and Darkness', r'Aabria', regex=True).astype('str')

In [425]:
 d20_df['player'] = d20_df['player'].str.replace(r'Ava', r'Erika', regex=True).astype('str')

In [426]:
 d20_df['player'] = d20_df['player'].str.replace(r'Lukas', r'Aabria', regex=True).astype('str')

In [427]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sea Foam Member', r'Aabria', regex=True).astype('str')

In [428]:
 d20_df['player'] = d20_df['player'].str.replace(r'Aqrabus Student', r'Aabria', regex=True).astype('str')

In [429]:
 d20_df['player'] = d20_df['player'].str.replace(r'Scary Stoat', r'Aabria', regex=True).astype('str')

In [430]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sam\'s Mom', r'Aabria', regex=True).astype('str')

In [431]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mrs\. Nguyen', r'Aabria', regex=True).astype('str')

In [432]:
 d20_df['player'] = d20_df['player'].str.replace(r'Kelmp Fans', r'Aabria', regex=True).astype('str')

In [433]:
 d20_df['player'] = d20_df['player'].str.replace(r'Teammates', r'Aabria', regex=True).astype('str')

In [434]:
 d20_df['player'] = d20_df['player'].str.replace(r'Teammate', r'Aabria', regex=True).astype('str')

In [435]:
 d20_df['player'] = d20_df['player'].str.replace(r'Baroness Alven', r'Aabria', regex=True).astype('str')

In [436]:
 d20_df['player'] = d20_df['player'].str.replace(r'King Oberon', r'Aabria', regex=True).astype('str')

In [437]:
 d20_df['player'] = d20_df['player'].str.replace(r'Baroness', r'Aabria', regex=True).astype('str')

In [438]:
 d20_df['player'] = d20_df['player'].str.replace(r'Salt Goblin', r'Aabria', regex=True).astype('str')

In [439]:
 d20_df['player'] = d20_df['player'].str.replace(r'Alven', r'Aabria', regex=True).astype('str')

In [440]:
 d20_df['player'] = d20_df['player'].str.replace(r'Wave Master', r'Aabria', regex=True).astype('str')

In [441]:
 d20_df['player'] = d20_df['player'].str.replace(r'Lady Sylmenar', r'Aabria', regex=True).astype('str')

In [442]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sea Foam Message', r'Aabria', regex=True).astype('str')

In [443]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dr\. Wenabocker', r'Aabria', regex=True).astype('str')

In [444]:
 d20_df['player'] = d20_df['player'].str.replace(r'Random student', r'Aabria', regex=True).astype('str')

In [445]:
 d20_df['player'] = d20_df['player'].str.replace(r'(.*)and(.*)', r'\1&\2', regex=True).astype('str')

In [446]:
 d20_df['player'] = d20_df['player'].str.replace(r'(.*)And(.*)', r'\1&\2', regex=True).astype('str')

In [447]:
 d20_df['player'] = d20_df['player'].str.replace(r'(.*)&(.*)', r'\1&\2', regex=True).astype('str')

In [448]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dove', r'Aabria', regex=True).astype('str')

In [449]:
 d20_df['player'] = d20_df['player'].str.replace(r'Hedge', r'Aabria', regex=True).astype('str')

In [450]:
 d20_df['player'] = d20_df['player'].str.replace(r'Gobble', r'Aabria', regex=True).astype('str')

In [451]:
 d20_df['player'] = d20_df['player'].str.replace(r'Scorple', r'Aabria', regex=True).astype('str')

In [452]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sorwen', r'Aabria', regex=True).astype('str')

In [453]:
 d20_df['player'] = d20_df['player'].str.replace(r'Jarl', r'Aabria', regex=True).astype('str')

In [454]:
 d20_df['player'] = d20_df['player'].str.replace(r'Scratch', r'Aabria', regex=True).astype('str')

In [455]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fable', r'Aabria', regex=True).astype('str')

In [456]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fairy', r'Aabria', regex=True).astype('str')

In [457]:
 d20_df['player'] = d20_df['player'].str.replace(r'Caw', r'Aabria', regex=True).astype('str')

In [458]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fey person', r'Aabria', regex=True).astype('str')

In [459]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fey', r'Aabria', regex=True).astype('str')

In [460]:
 d20_df['player'] = d20_df['player'].str.replace(r'Olliver/Teedles', r'Aabria', regex=True).astype('str')

In [461]:
 d20_df['player'] = d20_df['player'].str.replace(r'Teedles', r'Aabria', regex=True).astype('str')

In [462]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dr. Tara', r'Aabria', regex=True).astype('str')

In [463]:
 d20_df['player'] = d20_df['player'].str.replace(r'Walmer', r'Aabria', regex=True).astype('str')

In [464]:
 d20_df['player'] = d20_df['player'].str.replace(r'Olliver', r'Aabria', regex=True).astype('str')

In [465]:
 d20_df['player'] = d20_df['player'].str.replace(r'Stoat', r'Aabria', regex=True).astype('str')

In [466]:
 d20_df['player'] = d20_df['player'].str.replace(r'Wenabocker', r'Aabria', regex=True).astype('str')

In [467]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dr. Steel', r'Aabria', regex=True).astype('str')

In [468]:
 d20_df['player'] = d20_df['player'].str.replace(r'Director', r'Aabria', regex=True).astype('str')

In [469]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sam', r'Danielle', regex=True).astype('str')

In [470]:
 d20_df['player'] = d20_df['player'].str.replace(r'Robed Woman', r'Aabria', regex=True).astype('str')

In [471]:
 d20_df['player'] = d20_df['player'].str.replace(r'Freshman', r'Aabria', regex=True).astype('str')

In [472]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fire Elemental', r'Aabria', regex=True).astype('str')

In [473]:
 d20_df['player'] = d20_df['player'].str.replace(r'Professor', r'Aabria', regex=True).astype('str')

In [474]:
 d20_df['player'] = d20_df['player'].str.replace(r'Scuppers Player', r'Aabria', regex=True).astype('str')

In [475]:
 d20_df['player'] = d20_df['player'].str.replace(r'Tallulah', r'Aabria', regex=True).astype('str')

In [476]:
 d20_df['player'] = d20_df['player'].str.replace(r'SU Fan', r'Aabria', regex=True).astype('str')

In [477]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dream\’s Parents', r'Aabria', regex=True).astype('str')

In [478]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dream', r'Erika', regex=True).astype('str')

In [479]:
 d20_df['player'] = d20_df['player'].str.replace(r'Digsby', r'Aabria', regex=True).astype('str')

In [480]:
 d20_df['player'] = d20_df['player'].str.replace(r'Q', r'Aabria', regex=True).astype('str')

In [481]:
 d20_df['player'] = d20_df['player'].str.replace(r'Dragon', r'Aabria', regex=True).astype('str')

In [482]:
 d20_df['player'] = d20_df['player'].str.replace(r'Chipmunk', r'Aabria', regex=True).astype('str')

In [483]:
 d20_df['player'] = d20_df['player'].str.replace(r'Voices', r'Aabria', regex=True).astype('str')

In [484]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fourth Voice', r'Aabria', regex=True).astype('str')

In [485]:
 d20_df['player'] = d20_df['player'].str.replace(r'Scary Voice', r'Aabria', regex=True).astype('str')

In [486]:
 d20_df['player'] = d20_df['player'].str.replace(r'Voice', r'Aabria', regex=True).astype('str')

In [487]:
 d20_df['player'] = d20_df['player'].str.replace(r'Chimeron Student', r'Aabria', regex=True).astype('str')

In [488]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mila', r'Aabria', regex=True).astype('str')

In [489]:
 d20_df['player'] = d20_df['player'].str.replace(r'Sybil', r'Aabria', regex=True).astype('str')

In [490]:
 d20_df['player'] = d20_df['player'].str.replace(r'Bennett', r'Aabria', regex=True).astype('str')

In [491]:
 d20_df['player'] = d20_df['player'].str.replace(r'Talia', r'Aabria', regex=True).astype('str')

In [492]:
 d20_df['player'] = d20_df['player'].str.replace(r'Ellen', r'Aabria', regex=True).astype('str')

In [493]:
 d20_df['player'] = d20_df['player'].str.replace(r'Woman', r'Aabria', regex=True).astype('str')

In [494]:
 d20_df['player'] = d20_df['player'].str.replace(r'Danielle’s Opponent', r'Aabria', regex=True).astype('str')

In [495]:
 d20_df['player'] = d20_df['player'].str.replace(r'Attendant', r'Aabria', regex=True).astype('str')

In [496]:
 d20_df['player'] = d20_df['player'].str.replace(r'First-year', r'Aabria', regex=True).astype('str')

In [497]:
 d20_df['player'] = d20_df['player'].str.replace(r'Nurse Stitchnit', r'Aabria', regex=True).astype('str')

In [498]:
 d20_df['player'] = d20_df['player'].str.replace(r'Nurse', r'Aabria', regex=True).astype('str')

In [499]:
 d20_df['player'] = d20_df['player'].str.replace(r'Stitchnit', r'Aabria', regex=True).astype('str')

In [500]:
 d20_df['player'] = d20_df['player'].str.replace(r'Passerby', r'Aabria', regex=True).astype('str')

In [501]:
 d20_df['player'] = d20_df['player'].str.replace(r'Second Head', r'Aabria', regex=True).astype('str')

In [502]:
 d20_df['player'] = d20_df['player'].str.replace(r'Kid', r'Aabria', regex=True).astype('str')

In [503]:
 d20_df['player'] = d20_df['player'].str.replace(r'Owl', r'Aabria', regex=True).astype('str')

In [504]:
 d20_df['player'] = d20_df['player'].str.replace(r'David', r'Aabria', regex=True).astype('str')

In [505]:
 d20_df['player'] = d20_df['player'].str.replace(r'Tad', r'Aabria', regex=True).astype('str')

In [506]:
 d20_df['player'] = d20_df['player'].str.replace(r'Messanteu kid', r'Aabria', regex=True).astype('str')

In [507]:
 d20_df['player'] = d20_df['player'].str.replace(r'Oggles', r'Aabria', regex=True).astype('str')

In [508]:
 d20_df['player'] = d20_df['player'].str.replace(r'Other Oggle', r'Aabria', regex=True).astype('str')

In [509]:
 d20_df['player'] = d20_df['player'].str.replace(r'Oggle', r'Aabria', regex=True).astype('str')

In [510]:
 d20_df['player'] = d20_df['player'].str.replace(r'Ponst Family', r'Aabria', regex=True).astype('str')

In [511]:
 d20_df['player'] = d20_df['player'].str.replace(r'Peddleston', r'Aabria', regex=True).astype('str')

In [512]:
 d20_df['player'] = d20_df['player'].str.replace(r'Alexis', r'Aabria', regex=True).astype('str')

In [513]:
 d20_df['player'] = d20_df['player'].str.replace(r'Rosemont student', r'Aabria', regex=True).astype('str')

In [514]:
 d20_df['player'] = d20_df['player'].str.replace(r'Model', r'Aabria', regex=True).astype('str')

In [515]:
 d20_df['player'] = d20_df['player'].str.replace(r'The Frunthwinkle', r'Aabria', regex=True).astype('str')

In [516]:
 d20_df['player'] = d20_df['player'].str.replace(r'Frunthwinkle', r'Aabria', regex=True).astype('str')

In [517]:
 d20_df['player'] = d20_df['player'].str.replace(r'Old Dwarf', r'Aabria', regex=True).astype('str')

In [518]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mice', r'Aabria', regex=True).astype('str')

In [519]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mouse', r'Aabria', regex=True).astype('str')

In [520]:
 d20_df['player'] = d20_df['player'].str.replace(r'Maddy', r'Aabria', regex=True).astype('str')

In [521]:
 d20_df['player'] = d20_df['player'].str.replace(r'Criggle', r'Aabria', regex=True).astype('str')

In [522]:
 d20_df['player'] = d20_df['player'].str.replace(r'Penfrew', r'Aabria', regex=True).astype('str')

In [523]:
 d20_df['player'] = d20_df['player'].str.replace(r'Axelby', r'Aabria', regex=True).astype('str')

In [524]:
 d20_df['player'] = d20_df['player'].str.replace(r'Mutton Chops', r'Aabria', regex=True).astype('str')

In [525]:
 d20_df['player'] = d20_df['player'].str.replace(r'Yorick', r'Aabria', regex=True).astype('str')

In [526]:
 d20_df['player'] = d20_df['player'].str.replace(r'Grasphlax', r'Aabria', regex=True).astype('str')

In [527]:
 d20_df['player'] = d20_df['player'].str.replace(r'Teddy', r'Aabria', regex=True).astype('str')

In [528]:
 d20_df['player'] = d20_df['player'].str.replace(r'Khanh', r'Aabria', regex=True).astype('str')

In [529]:
 d20_df['player'] = d20_df['player'].str.replace(r'Lemli', r'Aabria', regex=True).astype('str')

In [530]:
 d20_df['player'] = d20_df['player'].str.replace(r'Viniscus', r'Aabria', regex=True).astype('str')

In [531]:
 d20_df['player'] = d20_df['player'].str.replace(r'Cauldron', r'Aabria', regex=True).astype('str')

In [532]:
 d20_df['player'] = d20_df['player'].str.replace(r'Boodle', r'Aabria', regex=True).astype('str')

In [533]:
 d20_df['player'] = d20_df['player'].str.replace(r'Simon', r'Aabria', regex=True).astype('str')

In [534]:
 d20_df['player'] = d20_df['player'].str.replace(r'Chorus', r'Aabria', regex=True).astype('str')

In [535]:
 d20_df['player'] = d20_df['player'].str.replace(r'Kiran', r'Aabria', regex=True).astype('str')

In [536]:
 d20_df['player'] = d20_df['player'].str.replace(r'Carter', r'Aabria', regex=True).astype('str')

In [537]:
 d20_df['player'] = d20_df['player'].str.replace(r'Pep', r'Aabria', regex=True).astype('str')

In [538]:
 d20_df['player'] = d20_df['player'].str.replace(r'Silence', r'Aabria', regex=True).astype('str')

In [539]:
 d20_df['player'] = d20_df['player'].str.replace(r'Speaker', r'Aabria', regex=True).astype('str')

In [540]:
 d20_df['player'] = d20_df['player'].str.replace(r'Kiran', r'Aabria', regex=True).astype('str')

In [541]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Aabria', r'Aabria', regex=True).astype('str')

In [542]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Emily', r'Emily', regex=True).astype('str')

In [543]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Omar', r'Omar', regex=True).astype('str')

In [544]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Surena', r'Surena', regex=True).astype('str')

In [545]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Lou', r'Lou', regex=True).astype('str')

In [546]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Brennan', r'Brennan', regex=True).astype('str')

In [547]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Oscar', r'Oscar', regex=True).astype('str')

In [548]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0Aabria', r'Aabria', regex=True).astype('str')

In [549]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Danielle', r'Danielle', regex=True).astype('str')

In [550]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0Rashawn', r'Rashawn', regex=True).astype('str')

In [551]:
 d20_df['player'] = d20_df['player'].str.replace(r'Aabria\xa0', r'Aabria', regex=True).astype('str')

In [552]:
 d20_df['player'] = d20_df['player'].str.replace(r'\xa0Siobhan', r'Siobhan', regex=True).astype('str')

In [553]:
 d20_df['player'] = d20_df['player'].str.replace(r'Roger', r'Aabria', regex=True).astype('str')

In [554]:
 d20_df['player'] = d20_df['player'].str.replace(r'Janet', r'Aabria', regex=True).astype('str')

In [555]:
 d20_df['player'] = d20_df['player'].str.replace(r'Maurice', r'Aabria', regex=True).astype('str')

In [556]:
 d20_df['player'] = d20_df['player'].str.replace(r'Philtrum', r'Aabria', regex=True).astype('str')

In [557]:
 d20_df['player'] = d20_df['player'].str.replace(r'Fergus', r'Aabria', regex=True).astype('str')

In [558]:
 d20_df['player'] = d20_df['player'].str.replace(r'Freya', r'Aabria', regex=True).astype('str')

In [559]:
 d20_df['player'] = d20_df['player'].str.replace(r'Coach', r'Aabria', regex=True).astype('str')

In [560]:
 d20_df['player'] = d20_df['player'].str.replace(r'Student', r'Aabria', regex=True).astype('str')

In [561]:
 d20_df['player'] = d20_df['player'].str.replace(r'Children', r'Aabria', regex=True).astype('str')

In [562]:
 d20_df['player'] = d20_df['player'].str.replace(r'Child', r'Aabria', regex=True).astype('str')

In [563]:
 d20_df['player'] = d20_df['player'].str.replace(r'Coggle', r'Aabria', regex=True).astype('str')

In [564]:
 d20_df['player'] = d20_df['player'].str.replace(r'Hoggle', r'Aabria', regex=True).astype('str')

In [565]:
 d20_df['player'] = d20_df['player'].str.replace(r'Anamica', r'Aabria', regex=True).astype('str')

In [566]:
 d20_df['player'] = d20_df['player'].str.replace(r'Second-year', r'Aabria', regex=True).astype('str')

In [567]:
 d20_df['player'] = d20_df['player'].str.replace(r'Peter', r'Aabria', regex=True).astype('str')

In [568]:
 d20_df['player'] = d20_df['player'].str.replace(r'Lou\xa0', r'Lou', regex=True).astype('str')

In [569]:
d20_df['player'] = d20_df['player'].str.replace(r'\xa0Lou', r'Lou', regex=True).astype('str')

In [570]:
d20_df['player'] = d20_df['player'].str.replace(r'Lou& Emily', r'Lou & Emily', regex=True).astype('str')

In [571]:
 d20_df['player'] = d20_df['player'].str.replace(r'Aabria $', r'Aabria', regex=True).astype('str')

In [572]:
 d20_df['player'] = d20_df['player'].str.replace(r'Erika $', r'Erika', regex=True).astype('str')

In [573]:
 d20_df['player'] = d20_df['player'].str.replace(r'Brennan $', r'Brennan', regex=True).astype('str')

In [574]:
 d20_df['player'] = d20_df['player'].str.replace(r'Everyone', r'All', regex=True).astype('str')

And all of that work narrows down that big list of character names to a pretty reasonable handful of player names. Much more manageable AND a lot more useful. 

In [575]:
print(d20_df['player'].unique())

['Aabria' 'All' 'Emily' 'Lou' 'Omar' 'Oscar' 'Brennan' 'Surena'
 'Lou & Emily' 'Brennan & Lou' 'Lou & Oscar' 'Aabria & Oscar' 'Players'
 'Emily & Lou' 'Lou & Brennan' 'Emily & Oscar' 'Ebony' 'Denise' 'Rick'
 'Mario' 'Kevin' 'Oscar, Surena & Lou' 'Rashawn' 'Jasper' 'Siobhan' 'Izzy'
 'Erika' 'Brennan & Jasper' 'Cast' 'Aabria & Jasper' 'Rashawn/Izzy'
 'Jasper ' 'Izzy & Siobhan' 'Brennan & Siobhan' 'Siobhan & Izzy'
 'Jasper & Erika' 'Siobhan & Jasper' 'Aabria & Siobhan' 'Crew member'
 'Carlos' 'Molly' 'Group' 'Danielle' 'Both' 'Danielle & Lou'
 'Aabria & Brennan' 'Lou & Danielle' 'Danielle & Erika'
 'Aabria & Danielle' 'Someone behind the camera' 'Orion' 'Paula' 'Todd']


In [576]:
d20_df['player'].value_counts() #these new totals look good too!

Aabria                       27029
Brennan                      12211
Lou                           6723
Erika                         4814
Emily                         3604
Jasper                        3386
Oscar                         3108
Omar                          2848
Rashawn                       2367
Siobhan                       2357
Surena                        2318
Izzy                          2295
Danielle                      1512
Kevin                          283
Orion                          282
Denise                         164
Todd                           124
Ebony                          108
Rick                           107
All                            102
Paula                           97
Mario                           75
Lou & Brennan                    6
Carlos                           6
Lou & Emily                      5
Jasper & Erika                   5
Players                          5
Cast                             4
Brennan & Lou       

Let's confirm now that we don't have any more speech lines that have Name: left as an element of the line

And take a look at that player column! Looks great :)

In [577]:
d20_df[d20_df['variable'].str.contains(':')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
2561,1270,"Cousin, this is 3:00 A.M. behavior!",1,ACOFF,,,Chirp,Emily
3203,30,"two, and it's time to begin, so let's begin wi...",2,ACOFF,,,Aabria,Aabria
4190,1017,is set to begin at 11:00 AM.,2,ACOFF,,,Aabria,Aabria
5219,2046,"And as you begin to talk, it was 11:00 AM for...",2,ACOFF,,,Aabria,Aabria
26050,153,And not like a rerun on Comedy Central at 11:...,3,ACOFF AP,,,Lou,Lou
32526,412,"up little flashes, and then you hear your moth...",1,Burrow's End,,,Aabria,Aabria
32532,418,"You edge a little closer, and you hear:",1,Burrow's End,,,Aabria,Aabria
32697,583,Let me be very clear: there's not clothes!,1,Burrow's End,[to camera],,Aabria,Aabria
33805,1691,tunnels. But the fun fact about the warren tha...,1,Burrow's End,,,Aabria,Aabria
35742,1524,It's 6:30.,2,Burrow's End,,,Izzy,Izzy


### Clean up some non-speech lines

There are some instances of lines that are descriptions of what is happening on the screen, in the room, or movement of the players. Since it's not speech or verbal information at all, I want to just ditch it. There isn't a simple way to search for all of these and be 100% sure I've captured every single one, but I'm going to do my best to find what I can. On the website, these texts are italicized. That information doesn't exist in this form of the data, but can be helpful in searching the public site for key phrases to look for, match up with, and eliminate.

In [578]:
d20_df[d20_df['variable'].str.contains('The Dome')] #"the dome" is the name of the room they record in

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
41423,46,"The Dome darkens, revealing paper cutouts of f...",5,Burrow's End,,,Brennan,Brennan
44072,2695,The Dome’s background changes to show a paper ...,5,Burrow's End,,,Sybil,Aabria


In [579]:
d20_df[d20_df['variable'].str.contains('silhouette')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
13899,725,silhouetted by a bunch of goblins just having ...,6,ACOFF,,,Aabria,Aabria
22742,1842,"and you see, off behind a pillar, the silhouet...",9,ACOFF,,,Aabria,Aabria
41437,60,"A silhouette of Last Bast appears, its door op...",5,Burrow's End,,,Aabria,Aabria


In [580]:
d20_df[d20_df['variable'].str.contains('winks')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
22239,1339,we probably went to sit with Suntar and Apollo...,9,ACOFF,,,Lou,Lou
41576,199,Izzy and Brennan trade winks across the table.,5,Burrow's End,,,Tula,Brennan
46296,1909,Brennan winks at Rashawn.,6,Burrow's End,,,Sybil,Aabria
64694,2135,Fergus is craning his neck to catch Sam’s eye....,3,Misfits and Magic,,,Aabria,Aabria


In [581]:
d20_df[d20_df['variable'].str.contains('expression')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
1738,444,translucent-at-the-edge figures wearing masks ...,1,ACOFF,,,Aabria,Aabria
7643,1642,"doe-eyed, confused, excited expression you've ...",3,ACOFF,,,Aabria,Aabria
8459,2458,"And for just a moment, her expression softens...",3,ACOFF,,,Aabria,Aabria
26982,416,expressions that you make as a Wuvvy.,4,ACOFF AP,,,Emily,Emily
32346,232,anguished expression and begins quietly hyperv...,1,Burrow's End,,,Beatrix,Aabria
33774,1660,"Rashawn, with an alarmed expression, begins fa...",1,Burrow's End,,,Thorn,Jasper
41839,462,"Erika makes a bewildered, mistrustful expression.",5,Burrow's End,,,Talia,Aabria
44295,2918,"Brennan stays silent, but looks at Izzy with a...",5,Burrow's End,,,Siobhan,Siobhan
46029,1642,Erika mimes energetically licking Jasper with ...,6,Burrow's End,,,Aabria,Aabria
50067,442,You see his expression does soften towards you.,8,Burrow's End,,,Aabria,Aabria


In [583]:
delete = [

42048,
41953,  
41891,
38996,
37177,
37087,
36938,
34023,
32828,
32392,
32387,
32379,
28265,
21959,
16626,
7145,
39045,
48016,
49003,
49604,
58759,
33774,
41839,
44295,
46029,
41423,
44072,
44073,
44074,
41437,
41438,
41576,
46296,
37603,
37784,   
37857,  
38441, 
38982,
42219,
42231,
42258,
42480,
42667,
42725,
43220,
43333,
43701,   
44479,  
44667, 
46638,
46994
]

In [584]:
d20_df = d20_df.drop(delete)

In [585]:
d20_df[d20_df['variable'].str.contains('mimes')]

Unnamed: 0,index,variable,episode,season,nonspeech,inaudible_speech,name,player
47140,2753,Brennan mimes destroying the weapon.,6,Burrow's End,,,Tula,Brennan
47658,3271,Jasper leans back and mimes clutching Jaysohn ...,6,Burrow's End,,,Jasper,Jasper
48059,289,A flash of Blue fills the Dome as Brennan mime...,7,Burrow's End,,,Brennan,Brennan
48370,600,Aabria mimes a huge paw reaching through a doo...,7,Burrow's End,,,Director,Aabria
48513,743,Aabria mimes a deep backwards throw. Jasper gr...,7,Burrow's End,,,Aabria,Aabria
48744,974,"She mimes riding it like a bronco, Jasper guff...",7,Burrow's End,,,Rashawn,Rashawn
49647,22,"Brennan sings the tune to Chariots of Fire, an...",8,Burrow's End,,,Rashawn,Rashawn
49650,25,Siobhan mimes cymbals crashing.,8,Burrow's End,,,Aabria,Aabria
49657,32,Rashawn mimes crossing a finish line and falli...,8,Burrow's End,,,Siobhan,Siobhan
49738,113,"Rashawn mimes pulling levers, Erika bounces li...",8,Burrow's End,,,Rashawn,Rashawn


In [None]:
d20_df.variable[41839]

camera
gestures