In [1]:
import pandas as pd
import re
import string
from spacy.lang.en import English

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# read in non-pdf maxfun dataframe
%store -r maxfun_df

In [2]:
# will need to match pdf dataframe columns:  Podcast, Episode, Title, Year, Text
maxfun_df.head()

Unnamed: 0,Episode,Title,Content
0,TRANSCRIPT The Flop House Ep. 329,King of the Monsters,"[﻿0:00:00, dan, On this episode of , The Flop ..."
1,TRANSCRIPT Flop House Ep. 330,"Freaky, with Barbara Crampton","[﻿0:00:00, dan, On this episode of , The Flop ..."
2,TRANSCRIPT Bullseye with Jesse Thorn,Stephen Malkmus on the song that changed his ...,"[﻿0:00:00, jesse thorn, It’s , Bullseye, . I’m..."
3,TRANSCRIPT Switchblade Sisters Ep. 150,‘Thelma & Louise’ with ‘Unpregnant’ Director ...,"[﻿00:00:00, music, “Switchblade Comb” by Mobiu..."
4,TRANSCRIPT Judge John Hodgman Ep. 484,Fun With Words and Lexicographer Emily Brewster,"[﻿0:00:00, sound effect, [Three gavel bangs.],..."


In [3]:
# separate episode number and podcast name
def epcleaner(episode):
    eplist = []
    
    episode = episode.lower()
    if 'transcript' in episode:
        episode = episode.replace('transcript','')
    
    epnum = r'\d+'
    epnumber = re.findall(epnum, episode)
    if len(epnumber) < 1:
        epnumber.append('')
    episode = re.sub(epnum, '', episode)
    
    f = r'\b[Ee]p(\.|isode){0,1}'
    episode = re.sub(f, '', episode)
    
    eplist.append(episode.strip())
    eplist.append(epnumber[0])
    
    return eplist
    

# returns list as [podcast, episode#]
print(maxfun_df.Episode[10], '>>>', epcleaner(maxfun_df.Episode[10]))    

TRANSCRIPT Bullseye with Jesse Thorn >>> ['bullseye with jesse thorn', '']


In [4]:
# map the episode cleaner function to podcast
maxfun_df['Podcast'] = maxfun_df.Episode.map(lambda e: epcleaner(e)[0])
maxfun_df.Podcast.value_counts()

bullseye with jesse thorn                                    63
one bad mother                                               29
judge john hodgman                                           28
the flop house                                               16
switchblade sisters                                          14
award-winning playwright katori hall on starz’ “p-valley”     1
flop house                                                    1
laura jane grace on going solo, against me!, and more         1
one bad mother bonus                                          1
Name: Podcast, dtype: int64

In [5]:
# map the episode cleaner function to episodes
maxfun_df['Episode'] = maxfun_df.Episode.map(lambda e: epcleaner(e)[1])
maxfun_df.head()

Unnamed: 0,Episode,Title,Content,Podcast
0,329.0,King of the Monsters,"[﻿0:00:00, dan, On this episode of , The Flop ...",the flop house
1,330.0,"Freaky, with Barbara Crampton","[﻿0:00:00, dan, On this episode of , The Flop ...",flop house
2,,Stephen Malkmus on the song that changed his ...,"[﻿0:00:00, jesse thorn, It’s , Bullseye, . I’m...",bullseye with jesse thorn
3,150.0,‘Thelma & Louise’ with ‘Unpregnant’ Director ...,"[﻿00:00:00, music, “Switchblade Comb” by Mobiu...",switchblade sisters
4,484.0,Fun With Words and Lexicographer Emily Brewster,"[﻿0:00:00, sound effect, [Three gavel bangs.],...",judge john hodgman


In [6]:
# sort columns
podcast = maxfun_df.pop('Podcast')
episodes = maxfun_df.pop('Episode')
year = pd.Series(dtype=str)
title = maxfun_df.pop('Title')
text = maxfun_df.pop('Content')

maxfun_df['Podcast'] = podcast
maxfun_df['Episode'] = episodes
maxfun_df['Title'] = title
maxfun_df['Year'] = year
maxfun_df['Text'] = text

maxfun_df.head()

Unnamed: 0,Podcast,Episode,Title,Year,Text
0,the flop house,329.0,King of the Monsters,,"[﻿0:00:00, dan, On this episode of , The Flop ..."
1,flop house,330.0,"Freaky, with Barbara Crampton",,"[﻿0:00:00, dan, On this episode of , The Flop ..."
2,bullseye with jesse thorn,,Stephen Malkmus on the song that changed his ...,,"[﻿0:00:00, jesse thorn, It’s , Bullseye, . I’m..."
3,switchblade sisters,150.0,‘Thelma & Louise’ with ‘Unpregnant’ Director ...,,"[﻿00:00:00, music, “Switchblade Comb” by Mobiu..."
4,judge john hodgman,484.0,Fun With Words and Lexicographer Emily Brewster,,"[﻿0:00:00, sound effect, [Three gavel bangs.],..."


In [7]:
maxfun_df = maxfun_df.sort_values(by=['Podcast'])
maxfun_df

Unnamed: 0,Podcast,Episode,Title,Year,Text
150,award-winning playwright katori hall on starz’...,,TRANSCRIPT Award-winning playwright Katori Hal...,,"[﻿0:00:00, music, Gentle, trilling music with ..."
76,bullseye with jesse thorn,,Cartoonist and Author Adrian Tomine,,"[﻿0:00:00, jesse thorn, Hey all, it’s Jesse. A..."
53,bullseye with jesse thorn,,Actor Richard Jenkins,,"[﻿0:00:00, music, Gentle, trilling music with ..."
100,bullseye with jesse thorn,,Fran Lebowitz,,"[﻿0:00:00, music, Gentle, trilling music with ..."
101,bullseye with jesse thorn,,John Wilson,,"[﻿0:00:00, music, Gentle, trilling music with ..."
...,...,...,...,...,...
15,the flop house,319,Battle Angel LIVE,,"[﻿0:00:00, music, Light, up-tempo, electric gu..."
124,the flop house,315,Hawk the Slayer,,"[﻿0:00:00, dan mccoy, On this episode we discu..."
23,the flop house,320,Last Christmas,,"[﻿0:00:00, dan, On this episode, we discuss—, ..."
47,the flop house,324,Hellboy LIVE,,"[﻿0:00:00, dan mccoy, On this episode of , The..."


In [8]:
# How does the text look?
for t in maxfun_df.sample(5).Text:
    print(t[:10])

['\ufeff0:00:00', 'music', '“Switchblade Comb” by Mobius VanChocStraw. A jaunty, jazzy tune reminiscent of the opening theme of a movie. Music continues at a lower volume as April introduces herself and her guest, and then it fades out.', '0:00:08', 'april wolfe', 'Welcome to ', 'Switchblade Sisters', ', where women get together to slice and dice our favorite action and genre films. I’m April Wolfe. Every week, I invite a new female filmmaker on. A writer, director, actor, or producer, and we talk—in depth—about one of their fave genre films. Perhaps one that’s influenced their own work. And again, you may already know, a reminder we are remote recording now, and I’m recording in my bedroom. Again, Chicken may scream, the leaf blowers might come out. I’m in Los Angeles, it’s just part of the milieu. The audio is likely going to sound a little bit different from our studios, but everything else is the same, except for also our guest is different. Today, I’m very excited to have writer-d

In [9]:
maxfun_df = maxfun_df.reset_index(drop=True)
maxfun_df.head()
maxfun_df.sample(10)

Unnamed: 0,Podcast,Episode,Title,Year,Text
0,award-winning playwright katori hall on starz’...,,TRANSCRIPT Award-winning playwright Katori Hal...,,"[﻿0:00:00, music, Gentle, trilling music with ..."
1,bullseye with jesse thorn,,Cartoonist and Author Adrian Tomine,,"[﻿0:00:00, jesse thorn, Hey all, it’s Jesse. A..."
2,bullseye with jesse thorn,,Actor Richard Jenkins,,"[﻿0:00:00, music, Gentle, trilling music with ..."
3,bullseye with jesse thorn,,Fran Lebowitz,,"[﻿0:00:00, music, Gentle, trilling music with ..."
4,bullseye with jesse thorn,,John Wilson,,"[﻿0:00:00, music, Gentle, trilling music with ..."


Unnamed: 0,Podcast,Episode,Title,Year,Text
122,one bad mother,374.0,What Will It Take to Slay the Motherhood Myth...,,"[﻿0:00:00, biz ellis, Hi. I’m Biz., 0:00:02, t..."
124,switchblade sisters,147.0,"‘Truly, Madly, Deeply’ with ‘Model Minority’ ...",,"[﻿0:00:00, music, “Switchblade Comb” by Mobius..."
135,switchblade sisters,138.0,‘Drive’ with ‘Stray Dolls’ Director Sonejuhi ...,,"[﻿0:00:00, music, “Switchblade Comb” by Mobius..."
69,judge john hodgman,476.0,Vampirical Evidence,,"[﻿0:00:00, sound effect, [Three gavel bangs.],..."
26,bullseye with jesse thorn,,"Robert Glasper, Grammy-winning R&B artist",,"[﻿0:00:00, music, Gentle, trilling music with ..."
61,bullseye with jesse thorn,,Kyle MacLachlan,,"[﻿0:00:00, music, Gentle, trilling music with ..."
20,bullseye with jesse thorn,,David Letterman,,"[﻿0:00:00, music, Gentle, trilling music with ..."
53,bullseye with jesse thorn,,The Isley Brothers’ Ernie Isley,,"[﻿0:00:00, music, Gentle, trilling music with ..."
102,one bad mother,364.0,"I Only Have One Child, So Easy Peasy! With Li...",,"[﻿Biz Ellis, host, Hi. I’m Biz., Theresa Thorn..."
106,one bad mother,365.0,The Second Ever Pandemic Genius Spectacular P...,,"[﻿0:00:00, biz ellis, Hi. I’m Biz., 0:00:02, t..."


In [10]:
maxfun_df.Podcast.value_counts()

bullseye with jesse thorn                                    63
one bad mother                                               29
judge john hodgman                                           28
the flop house                                               16
switchblade sisters                                          14
laura jane grace on going solo, against me!, and more         1
award-winning playwright katori hall on starz’ “p-valley”     1
flop house                                                    1
one bad mother bonus                                          1
Name: Podcast, dtype: int64

In [11]:
# sort speakers
def parse_speech(text, ignored_strings):

    hosts = set([t.strip() for t in text if t[0].islower() and t != 'crosstalk'])
    text = '  '.join([t.lower().strip() for t in text])
    for i in ignored_strings:
        text = re.sub(i, '', text)
    text = re.sub(r'\\', '', text)
    text = re.sub(r'\[.*?]', '', text)

        
    speech_dict = {}
    for h in hosts:
        speech = ''
        regex = '(?<=\d:\d\d:\d\d) *' + h + '.*?(?=\d:\d\d:\d\d)'
        speech = re.findall(regex, text)
        speech = [(re.sub(h, '', s)).strip() for s in speech]
        speech_dict[h] = speech
    
    text = re.sub(r'\d:\d\d:\d\d', '', text)

    for h in hosts:
        text = re.sub(h, 'SPEAKER', text)
    

    
    return text, speech_dict

In [12]:
maxfun_df.Text[25]

['\ufeff0:00:00',
 'music',
 'Gentle, trilling music with a steady drumbeat plays under the dialogue.',
 '0:00:01',
 'promo',
 'Speaker',
 ': ',
 'Bullseye with Jesse Thorn',
 ' is a production of ',
 'MaximumFun.org',
 ' and is distributed by NPR. ',
 '[Music fades out.]',
 '0:00:12',
 'music',
 '“Huddle Formation” from the album ',
 'Thunder, Lightning, Strike',
 ' by The Go! Team.',
 '0:00:19',
 'jesse thorn',
 'It’s ',
 'Bullseye',
 '. I’m Jesse Thorn. Carrie Coon, the actor, is said to exude a Midwestern pragmatism. At least, that’s how one ',
 'New Yorker',
 ' article described her. I guess you can say it’s in the way she carries herself. As a performer, she’s confident—never timid. Usually warm, not always. You can see it in some of her most iconic roles. As Nora, in ',
 'The Leftovers',
 ', she’s angry and kind of tightly wound, traumatized by the loss of her family. On season three of ',
 'Fargo',
 ', where she played Police Chief Gloria Burgle: brave in the face of danger but

In [13]:
def clean_text(text):
    text = [t for t in text if re.search('[a-zA-Z]', t) != None]
    text = ' '.join(text)
    return text

clean_text(maxfun_df.Text[25])
    

"music Gentle, trilling music with a steady drumbeat plays under the dialogue. promo Speaker Bullseye with Jesse Thorn  is a production of  MaximumFun.org  and is distributed by NPR.  [Music fades out.] music “Huddle Formation” from the album  Thunder, Lightning, Strike  by The Go! Team. jesse thorn It’s  Bullseye . I’m Jesse Thorn. Carrie Coon, the actor, is said to exude a Midwestern pragmatism. At least, that’s how one  New Yorker  article described her. I guess you can say it’s in the way she carries herself. As a performer, she’s confident—never timid. Usually warm, not always. You can see it in some of her most iconic roles. As Nora, in  The Leftovers , she’s angry and kind of tightly wound, traumatized by the loss of her family. On season three of  Fargo , where she played Police Chief Gloria Burgle: brave in the face of danger but also baffled at humanity’s capacity to be so violent and cruel. She’s starring in the new film  The Nest The Nest  was directed by Sean Durkin, who p

In [14]:
current_text = maxfun_df.Text[11]

In [15]:
maxfun_df['Anonymized_text'] = maxfun_df.Text.map(lambda t: parse_speech(t, ['max','maximum','fun','comedy'])[0])
maxfun_df['Speaker_parts'] = maxfun_df.Text.map(lambda t: parse_speech(t, ['max','maximum','fun','comdedy'])[1])
maxfun_df['Text'] = maxfun_df.Text.map(clean_text)

In [16]:
bullseye = maxfun_df.groupby('Podcast').get_group('bullseye with jesse thorn').sort_values(by=['Episode']).reset_index(drop=True)
bullseye.sample(3)
%store bullseye

Unnamed: 0,Podcast,Episode,Title,Year,Text,Anonymized_text,Speaker_parts
61,bullseye with jesse thorn,,"Ma Rainey’s Black Bottom, Marrying Aretha Fra...",,"music Gentle, trilling music with a steady dru...","﻿ SPEAKER gentle, trilling SPEAKER with a st...",{'glynn turman': ['thank you. thank you for ha...
45,bullseye with jesse thorn,,Catherine O’Hara,,"music Gentle, trilling music with a steady dru...","﻿ SPEAKER gentle, trilling SPEAKER with a st...","{'like A Mighty Wind': [], 'catherine o’hara':..."
25,bullseye with jesse thorn,,Author Jeff VanderMeer,,"music Gentle, trilling music with a steady dru...","﻿ SPEAKER gentle, trilling SPEAKER with a st...",{'jeff': ['vandermeer thanks so much for havi...


Stored 'bullseye' (DataFrame)


In [17]:
mother = maxfun_df.groupby('Podcast').get_group('one bad mother').sort_values(by=['Episode']).reset_index(drop=True)
mother.sample(3)
mother.Text[0]
%store mother

Unnamed: 0,Podcast,Episode,Title,Year,Text,Anonymized_text,Speaker_parts
9,one bad mother,367,"Plants Will Help, with Jade from Black Plant ...",,biz Hi. I’m Biz. theresa And I’m Theresa. biz ...,﻿ SPEAKER hi. i’m SPEAKER. SPEAKER and i...,"{'ficus elastica': [], 'caller': ['thanks for ..."
6,one bad mother,364,"I Only Have One Child, So Easy Peasy! With Li...",,﻿Biz Ellis host Hi. I’m Biz. Theresa Thorn hos...,﻿biz ellis SPEAKER hi. i’m biz. theresa tho...,"{'caller': [], 'guest': [], 'host': [], 'clip'..."
10,one bad mother,368,Is Mind Reading a Symptom of the Coronavirus?,,biz Hi. I’m Biz. theresa And I’m Theresa. biz ...,﻿ SPEAKER hi. i’m SPEAKER. SPEAKER and i...,{'caller': ['so i’m doing… i’m okay. i’m getti...


"biz ellis Hi. I’m Biz. theresa thorn And I’m Theresa. biz Due to the pandemic, we bring you  One Bad Mother  straight from our homes—including such interruptions as: children! Animal noises! And more! So let’s all get a little closer while we have to be so far apart. And remember—we are doing a good job. music “Summoning the Rawk” by Kevin MacLeod. Driving electric guitar and heavy drums.  [Continues through dialogue.] biz This week on  One Bad Mother —a band-aid ain’t gonna fix this! We talk about parents being the front line against racism. Plus, Biz takes a tone and we welcome James Arthur, the host of the podcast  Minority Korner [Biz and James repeatedly affirm each other as they discuss their respective weeks.] crosstalk Biz and James Arthur : Wooooo!  [Both laugh.] james arthur We hit all the octaves!  [Laughs.] biz We did! That’s—I’m not sure I’ve ever woo’d with a man. crosstalk James : Oh! Okay! Ahh. Gotcha.  Biz : Besides my husband! Besides my husband.  [Laughs.] james Wel

Stored 'mother' (DataFrame)


In [18]:
hodgman = maxfun_df.groupby('Podcast').get_group('judge john hodgman').sort_values(by=['Episode']).reset_index(drop=True)
hodgman.sample(3)
hodgman.Text[0]
%store hodgman

Unnamed: 0,Podcast,Episode,Title,Year,Text,Anonymized_text,Speaker_parts
5,judge john hodgman,475,You Can’t Acquit With Us,,sound effect [Three gavel bangs.] jesse thorn ...,﻿ SPEAKER SPEAKER welcome to the judge...,"{'speaker 2': ['comedy and culture.'], 'on Fac..."
23,judge john hodgman,493,Mr. Clicky Keys,,sound effect [Three gavel bangs.] jesse thorn ...,﻿ SPEAKER SPEAKER welcome to the judge...,{'can be heard faintly in the background.]': [...
9,judge john hodgman,479,The Shears Club,,sound effect [Three gavel bangs.] jesse thorn ...,﻿ SPEAKER SPEAKER welcome to the judge...,"{'speaker 2': ['comedy and culture.'], 'joel':..."


'sound effect [Three gavel bangs.] monte belmonte Welcome to the  Judge John Hodgman  podcast. I\'m summertime... less fun-time guest bailiff, Monte Belmonte, filling in for the actual bailiff, Jesse Thorn. We are in chambers this week to clear the docket! And now, the Webby Award–winning host of the  Judge John Hodgman  podcast, Judge John Hodgman. What a thrill it was, Judge John Hodgman, when you included me in that Tweet!  [John chuckles.]  To tell me that this podcast had won a Webby! And then I go online and watch the Zoom Webby Awards, and there is none other than Monica Lewinsky giving you the award. john hodgman Yeah. monte It was really—it was fantastic. john Yeah, Monte, thank you. That—that I did not predict. I did not know that Monica Lewinsky would be... virtually presenting me with the Webby Award. But! The name of the show is  Judge John Hodgman , but as you know, Monte, there are a lot of people on Team  Judge John Hodgman . Obviously Bailiff Jesse Thorn, who can\'t be

Stored 'hodgman' (DataFrame)


In [19]:
flophouse = maxfun_df.groupby('Podcast').get_group('the flop house').sort_values(by=['Episode']).reset_index(drop=True)
flophouse.sample(3)
flophouse.Text[0]
%store flophouse

Unnamed: 0,Podcast,Episode,Title,Year,Text,Anonymized_text,Speaker_parts
8,the flop house,323,Deadly Lessons,,dan On this episode we discuss— Deadly Lessons...,﻿ SPEAKER on this episode we discuss— deadl...,"{'stuart': ['and the lesson is, watch this mov..."
12,the flop house,327,International,,"dan mccoy On this episode, we discuss Men in ...","﻿0 SPEAKER mccoy on this episode, we discuss...",{'stuart': ['wellington brought to you by the...
1,the flop house,316,Between Worlds,,dan mccoy On this episode we discuss: Between...,﻿ SPEAKER mccoy on this episode we discuss: ...,{'stuart': ['wellington the #1 search result ...


'dan mccoy On this episode we discuss:  Hawk the Slayer elliott kalan The movie that dares to ask the question—what if you watched your friends play D&D for an hour and a half, but you didn’t get to share their Doritos?  [Someone laughs quietly.] music Light, up-tempo, electric guitar with synth instruments. dan Hey, everyone, and welcome to  The Flop House ! I’m Dan McCoy. stuart Hey, Dan McCoy! It’s me! Stuart Wellington! elliott Over here is Elliott Kalan. Usually I’d waste a lot of time doing some kind of bit where I introduce my name and it takes a while? But we don’t have time for that! Because! I wanted to introduce our special guest star for this episode—an actual, honest-to-goodness, television superstar—that’s right—  [Dan laughs.]  Superstar, I said! You may know her as the creator and star of  The Guild . You may know her as a best-selling author! You may know her—like, you do know her—as Kinga Forrester on  Mystery Science Theater 3000 . The return for Netflix. Very excite

Stored 'flophouse' (DataFrame)


In [20]:
switchblade = maxfun_df.groupby('Podcast').get_group('switchblade sisters').sort_values(by=['Episode']).reset_index(drop=True)
switchblade.sample(3)
switchblade.Text[0]
%store switchblade

Unnamed: 0,Podcast,Episode,Title,Year,Text,Anonymized_text,Speaker_parts
9,switchblade sisters,145,‘The Company of Wolves’ with Actor and ‘Preve...,,music “Switchblade Comb” by Mobius VanChocStra...,﻿ SPEAKER “switchblade comb” by mobius vanch...,{'april wolfe': ['welcome to switchblade sist...
6,switchblade sisters,142,‘Possession’ with ‘The Rental’ and ‘A Girl Wa...,,"april wolfe Hey, this is April Wolfe, host of ...","﻿ SPEAKER hey, this is SPEAKER, host of swi...","{'casey o’brien': ['april? april, is that you?..."
11,switchblade sisters,148,‘The Faculty’ with ‘Coin Heist’ Director Emil...,,music “Switchblade Comb” by Mobius VanChocStra...,﻿ SPEAKER “switchblade comb” by mobius vanch...,"{'emily': ['hagins hi, april, thank you for h..."


'music “Switchblade Comb” by Mobius VanChocStraw. A jaunty, jazzy tune reminiscent of the opening theme of a movie. Music continues at a lower volume as April introduces herself and her guest, and then it fades out. april wolfe Welcome to  Switchblade Sisters , where women get together to slice and dice our favorite action and genre films. I’m April Wolfe. Every week, I invite a new female filmmaker on. A writer, director, actor, or producer, and we talk—in depth—about one of their fave genre films. Perhaps one that’s influenced their own work in some small way. And you may already know, but a reminder that we are remote recording now, and I am in my bedroom. Chicken is asleep, so she’s probably not gonna start screeching at us. You might hear some birds. The audio is likely going to sound a little different from our studio’s, but everything else is the same. Except for today, our great guest, who we’ve got—for some reason we keep getting all these wonderful people out of Australia. Di

Stored 'switchblade' (DataFrame)
