# All podcasts

### Load elements and build DataFrame

In [1]:
import pandas as pd
import re
import nltk
import spacy
from spacy.lang.en import English
import collections
from collections import Counter
from itertools import chain
import statistics
import math

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# initialize spacy objects
nlp = spacy.load('en_core_web_md')

In [3]:
%store -r Nightvale_df
%store -r myDNA_df
%store -r YWA_df
%store -r uu_df
%store -r radiolab_df
%store -r tal_df
%store -r bullseye
%store -r mother
%store -r hodgman
%store -r flophouse
%store -r switchblade
%store -r mbmbam
%store -r sawbones
%store -r wonderful
%store -r tgg
%store -r ffire
%store -r shmanners
%store -r taz
%store -r neoscum_df


# %store -r freak_df
# %store -r Lore_df
# %store -r Invisible_df
# %store -r OnBeing_df
# %store -r StoryCorps_df

In [4]:
radiolab_df.head(3)

Unnamed: 0,Title,Year,Text
0,Bit Flip,2019,[ADVERTISEMENT] [RADIOLAB INTRO] SIMON ...
1,Dispatches from 1918,2020,[RADIOLAB INTRO] PAT WALTERS: Jad? JAD ...
2,The Beauty Puzzle,2019,Radiolab: The Beauty Puzzle CHAPTER 1 (D...


In [5]:
neoscum_df.head(3)

Unnamed: 0,Episode,Year,Title,Text
0,1,,Darkmovers,Mike Migdall (MM): Hey everybody and welcome t...
1,2,,Ctrl Actions,Gannon Reedy (GR): We’re in it. We’re in the g...
2,3,,Lil Marco,Gannon Reedy (GR): Exterior. Hungry Hound Dine...


In [6]:
# create a multindexed dataframe
data = pd.concat([Nightvale_df.reset_index(drop=True),
                  myDNA_df.reset_index(drop=True),
                  YWA_df.reset_index(drop=True),                  
                  uu_df.reset_index(drop=True),
                  radiolab_df.reset_index(drop=True),
                  tal_df.reset_index(drop=True),
                  bullseye,
                  mother,
                  hodgman,
                  flophouse,
                  switchblade,
                  mbmbam,
                  sawbones,
                  wonderful,
                  tgg,
                  ffire,
                  shmanners,
                  taz,
                  neoscum_df],
                  # StoryCorps_df, OnBeing_df, Invisible_df, Lore_df],
                  keys = ['Welcome to Nightvale','Move Your DNA','You\'re Wrong About','Unlocking Us',
                         'Radiolab','This American Life', 'Bullseye with Jesse Thorn','One Bad Mother',
                         'Judge John Hodgman','The Flophouse','Switchblade Sisters',
                         'MBMBaM','Sawbones','Wonderful','The Greatest Generation','Friendly Fire','Shmanners',
                          'The Adventure Zone','NeoScum'], names=['podcast','#']).reset_index(level=1)

In [7]:
data.sample(10)

Unnamed: 0_level_0,#,Episode,Year,Title,Text,Podcast
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MBMBaM,22,536.0,2020.0,Ratashootie,MBMBaM 536: Ratashootie Published on November ...,MBMBaM
NeoScum,9,10.0,,"Lights, Camera, Action",Gannon Reedy (GR): Just a little reminder. You...,
This American Life,382,383.0,,Origin Story,Prologue Ira Glass Pino Audia teach...,
Move Your DNA,51,63.0,,Movement Matters #2: Creating a Personal Missi...,KATY: It is. And that's the nature of the sh...,
This American Life,519,522.0,,Tarred and Feathered,"Prologue Ira Glass Hey, everybody, ...",
This American Life,171,172.0,,24 Hours at the Golden Apple,Act One: Day Ira Glass There are c...,
Radiolab,49,,2017.0,Shots Fired: Part 2,,
This American Life,521,524.0,,I Was So High,Prologue Ira Glass One of the produ...,
This American Life,557,560.0,,Abdi and the Golden Ticket,Prologue: Prologue Ira Glass Every ...,
Radiolab,130,,2020.0,Baby Blue Blood Drive,"Speaker 1: Wait, wait, you're li- (laughs) ...",


In [8]:
data = data.drop(columns=['#', 'Podcast'])
data.sample(20)

Unnamed: 0_level_0,Episode,Year,Title,Text
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Move Your DNA,96.0,,Birth and the Shapeshifting Pelvis,and a bunch of other books about movement. Th...
Move Your DNA,95.0,,The Maple Syrup Workout,Move Your DNA and a bunch of other books abo...
Radiolab,,2018.0,War of the Worlds,"JAD ABUMRAD: Hey, I’m Jad Abumrad. ROBERT KR..."
NeoScum,5.0,,We Are NeoScum,Gannon Reedy (GR): Zoom in super fast. Super f...
Bullseye with Jesse Thorn,,,Singer-Songwriter Ani DiFranco,"jesse thorn Hey, gang. It’s Jesse. We’re ge..."
This American Life,464.0,,Invisible Made Visible,Prologue Ira Glass Ryan started go...
Radiolab,,2015.0,Antibodies Part 1: CRISPR,"Jad Abumrad: Wait, wait... You're listening ..."
Unlocking Us,,2020.0,Brené with Priya Parker on The Art of Gathering,Transcript Brené Brown : Hi everyone. I’m B...
This American Life,268.0,,My Experimental Phase,Prologue Ira Glass How exactly is i...
Sawbones,343.0,2020.0,Osteopathic Medicine,Sawbones 343: Osteopathic Medicine Published 1...


In [9]:
data.index.value_counts()

This American Life           732
Radiolab                     261
Welcome to Nightvale         170
Move Your DNA                108
Bullseye with Jesse Thorn     63
MBMBaM                        32
Sawbones                      30
One Bad Mother                29
Wonderful                     29
Shmanners                     28
The Greatest Generation       28
Judge John Hodgman            28
Friendly Fire                 28
NeoScum                       20
The Adventure Zone            19
The Flophouse                 16
Switchblade Sisters           14
You're Wrong About            13
Unlocking Us                  12
Name: podcast, dtype: int64

In [10]:
data.loc[data.index=='NeoScum'].Text[0][:1000]

'Mike Migdall (MM): Hey everybody and welcome to the NeoScum actual play podcast!  Gannon Reedy (GR): Oh fuck yeah! [laughs]  [everyone laughs]   MM: You wanna start over?  [crosstalk; the theme to the podcast begins to play. It sounds sort of like the Full House theme a little bit? There’s a Great Saxophone Part in it.]  MM: Heeeeey!  GR: Heeeeeey! Aww, yes!   MM: Yes!!  GR: Welcome to NeoScum!  MM: A Shadowrun actual play roleplaying podcast.   GR: My name is Gannon Reedy.  MM: My name is Mike Migdall  GR: Dude… you’re listening to us, but do you even know what Shadowrun is?  MM: Do you? Well that’s why we’re here, to tell you what Shadowrun is.  GR: It’s like a futuristic D&D type game. I like to describe it as Blade Runner but with orcs.   MM: Woah. That’s cool, because I like to describe as Blade Runner except with orcs, elves, dwarves, and gnomes. And humans.  GR: Hmm. I guess that’s more accurate.  MM: And also centaur. I’m getting a “go longer” from my producer.  GR: Okay. Go l

In [11]:
# get rid of texts less than 6500 characters
podcast_df = pd.DataFrame()

for i in range(len(data)):
    if len(data.iloc[i, 3]) > 6500:  # I kept changing this number to see what returned, this gets rid of the erroneous text
        podcast_df = podcast_df.append(data.iloc[i, :])

In [12]:
podcast_df

Unnamed: 0,Episode,Text,Title,Year
Welcome to Nightvale,000a,"CECIL: As a matter of fact, the facts don’t ma...",matter of blood part 2,2018
Welcome to Nightvale,001,"And now the news. Old Woman Josie, out near th...",pilot,2012
Welcome to Nightvale,002,"And now, the news. Have any of our listeners s...",glow cloud,2012
Welcome to Nightvale,003,The Night Vale Business Association is proud t...,station management,2012
Welcome to Nightvale,004,The Pteranodons mostly attacked women with gla...,pta meeting,2012
...,...,...,...,...
NeoScum,16,"Mike Migdall (MM): So, daddy, what happens? ...",Quarter to Dead,
NeoScum,17,Gannon Reedy (GR): Okay so are we ready to sta...,Raw Deal,
NeoScum,18,Casey Toney (CT): [reading sponsor blurb] Char...,Open Sesame,
NeoScum,19,"Gannon Reedy (GR): Dude, I just broke it to my...",Night Shift,


In [13]:
podcast_df.sample(10)

Unnamed: 0,Episode,Text,Title,Year
This American Life,88.0,Prologue Ira Glass Andrea was work...,Numbers,
Radiolab,,"Introducers: Wait, wait, you're listening .....",Asking for a Friend,2019.0
This American Life,46.0,Act One: Anti-Oedipus Different Voic...,Sissies,
This American Life,96.0,Prologue Ira Glass 102557. The numb...,Pinned by History,
This American Life,126.0,Prologue Ira Glass From WBEZ Chicag...,Do-Gooders,
The Greatest Generation,317.0,Note: This show periodically replaces their ad...,Final Draft,
Move Your DNA,54.0,KATY: They’re essays that have never been pu...,Sedentary Culture in the News,
Wonderful,151.0,Wonderful! 151: Michaels Soul Connections Publ...,Michaels Soul Connections 1,2020.0
This American Life,30.0,Act One: Act One Ira Glass So Paul...,Obsession,
Radiolab,,Jad Abumrad: This is Jad Abumrad. This is Rad...,In The Dust Of This Planet,2014.0


### Podcast-specific info

Podcast-specific columns:  
    *- # of hosts - 0.5 indicates a whether or not the podcast regularly has guests
    *- genre, topic
    *- scripted/unscripted
    *- fiction/nonfiction
    *- format (interview, chat, etc)
        *- chat is generally talking about things, recap is talking about a specific thing (tv show, movie, etc)
    *- rating (itunes)
        *- I anticipated all ratings to be 4 or above, not 4.6 or above.  That teeny-tiny margin doesn't seem like it will be useful for analysis

This has been the hardest part of the project so far.  A lot of these categories are open to interpretation.

In [14]:
pod_feats = [['Welcome to Nightvale', 1, ['comedy', 'sci-fi'], 'scripted', 'fiction', 'news', 4.8],
             ['Move Your DNA', 2, ['health', 'fitness'], 'unscripted', 'nonfiction', 'chat', 4.8],
             ['You\'re Wrong About', 2, ['history', 'education'], 'unscripted', 'nonfiction', 'chat', 4.6],
             ['Unlocking Us', 1.5, ['health', 'lifestyle'], 'unscripted', 'nonfiction', 'interview', 4.6],
             ['Radiolab', 2, ['society', 'education'], 'unscripted', 'nonfiction', 'storytelling', 4.7],
             ['This American Life', 1.5, ['society','history'], 'unscripted', 'nonfiction', 'storytelling', 4.6],
             ['Bullseye with Jesse Thorn' , 1.5, ['comedy', 'society'], 'unscripted', 'nonfiction', 'interview', 4.7],
             ['One Bad Mother', 2.5, ['comedy', 'parenting'], 'unscripted', 'nonfiction', 'chat', 4.7],
             ['Judge John Hodgman', 1.5, ['comedy, advice'], 'unscripted', 'nonfiction', 'chat', 4.8],
             ['The Flophouse' , 3, ['comedy', 'movies'], 'unscripted', 'nonfiction', 'recap', 4.8],
             ['Switchblade Sisters', 1.5, ['comedy', 'movies'], 'unscripted', 'nonfiction', 'chat', 4.9],
             ['MBMBaM', 3, ['comedy','advice'], 'unscripted', 'nonfiction', 'chat', 4.9],
             ['Sawbones', 2, ['history', 'medicine'], 'unscripted', 'nonfiction', 'storytelling', 4.8],
             ['Wonderful', 2, ['comedy', 'society'], 'unscripted', 'nonfiction', 'chat', 4.9],
             ['The Greatest Generation', 2, ['comedy', 'TV'], 'unscripted', 'nonfiction', 'recap', 4.9],
             ['Friendly Fire', 3, ['history', 'movies'], 'unscripted', 'nonfiction', 'recap', 4.6],
             ['Shmanners', 2, ['society', 'advice'], 'unscripted', 'nonfiction', 'chat', 4.8],
             ['The Adventure Zone', 3, ['games', 'RP'], 'unscripted', 'fiction', 'LARP', 4.9],
             ['NeoScum', 5, ['games', 'RP'], 'unscripted', 'fiction', 'LARP', 4.9]]

# In case you're a cool person reading this and don't know, LARP is live action role playing.

In [15]:
pod_feats_df = pd.DataFrame(pod_feats, columns = ['podcast', 'Hosts', 'Genre-Topic', 
                                                  'Scripted/Un', 'Fiction/Non', 
                                                  'Format', 'Rating']).set_index('podcast')
pod_feats_df

Unnamed: 0_level_0,Hosts,Genre-Topic,Scripted/Un,Fiction/Non,Format,Rating
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Welcome to Nightvale,1.0,"[comedy, sci-fi]",scripted,fiction,news,4.8
Move Your DNA,2.0,"[health, fitness]",unscripted,nonfiction,chat,4.8
You're Wrong About,2.0,"[history, education]",unscripted,nonfiction,chat,4.6
Unlocking Us,1.5,"[health, lifestyle]",unscripted,nonfiction,interview,4.6
Radiolab,2.0,"[society, education]",unscripted,nonfiction,storytelling,4.7
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7
One Bad Mother,2.5,"[comedy, parenting]",unscripted,nonfiction,chat,4.7
Judge John Hodgman,1.5,"[comedy, advice]",unscripted,nonfiction,chat,4.8
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8


In [16]:
podcast_df = pod_feats_df.join(podcast_df, on='podcast', sort=True)
podcast_df.sample(10)

Unnamed: 0_level_0,Hosts,Genre-Topic,Scripted/Un,Fiction/Non,Format,Rating,Episode,Text,Title,Year
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sawbones,2.0,"[history, medicine]",unscripted,nonfiction,storytelling,4.8,327.0,Sawbones 327: COVID-19 and Bad Data Published ...,COVID 19 and Bad Data,2020.0
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...",David Cross,
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,607.0,Prologue Ira Glass So say your nam...,Didn’t We Solve This One?,
Sawbones,2.0,"[history, medicine]",unscripted,nonfiction,storytelling,4.8,343.0,Sawbones 343: Osteopathic Medicine Published 1...,Osteopathic Medicine,2020.0
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...",A One Man Show”,
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,539.0,"Prologue Ira Glass Now, a man who t...",The Leap,
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,645.0,Prologue Ira Glass Zoe Chace? Zoe ...,My Effing First Amendment,
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,730.0,Prologue: Prologue Announcer A qui...,The Empty Chair,
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...","Musician Phil Elverum of the Microphones, Mou...",
Shmanners,2.0,"[society, advice]",unscripted,nonfiction,chat,4.8,217.0,"Shmanners 217: Drive Ins Published July 3rd, 2...",Drive Ins,2020.0


In [17]:
podcast_df.index.value_counts()

This American Life           731
Radiolab                     192
Welcome to Nightvale         168
Move Your DNA                104
Bullseye with Jesse Thorn     63
MBMBaM                        32
Sawbones                      30
One Bad Mother                29
Wonderful                     29
Shmanners                     28
The Greatest Generation       28
Judge John Hodgman            28
Friendly Fire                 28
NeoScum                       20
The Adventure Zone            19
The Flophouse                 16
Switchblade Sisters           14
You're Wrong About            13
Unlocking Us                  12
Name: podcast, dtype: int64

### Text-processing functions

In [18]:
# anticipating a log of ugly floats
def percent(decimal):
    decimal *= 100
    percentage = '{:.3f}'.format(decimal)
    percentage = float(percentage)
    return percentage

percent(0.45615981981)

45.616

In [19]:
mini = podcast_df.sample(5)
tokens = mini.Text.map(nlp)
tokens

podcast
This American Life    (   , Prologue,        , Ira, Glass,  , Aphrod...
Move Your DNA         (Cedarsong, Nature, School,  \n  , Natural, St...
Shmanners             (Shmanners, 217, :, Drive, Ins, Published, Jul...
This American Life    (   , Prologue,        , Ira, Glass,   , I, 'm...
This American Life    (   , Prologue,        , Ira, Glass,  , At, fi...
Name: Text, dtype: object

In [20]:
podcast_df

Unnamed: 0_level_0,Hosts,Genre-Topic,Scripted/Un,Fiction/Non,Format,Rating,Episode,Text,Title,Year
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"jesse thorn Hey all, it’s Jesse. As 2020 dr...",Cartoonist and Author Adrian Tomine,
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...","Cristin Milioti on ‘Palm Springs,’ ‘How I Met...",
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...",Musician Frank Turner,
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...",Isiah Whitlock Jr.,
Bullseye with Jesse Thorn,1.5,"[comedy, society]",unscripted,nonfiction,interview,4.7,,"music Gentle, trilling music with a steady ...","Maya Erskine and Anna Konkle of PEN15, back f...",
...,...,...,...,...,...,...,...,...,...,...
You're Wrong About,2.0,"[history, education]",unscripted,nonfiction,chat,4.6,,Sarah: Yeah. Instead of MLMs in the early ei...,The O.J. Simpson Trial: The DeLorean Detour,2021
You're Wrong About,2.0,"[history, education]",unscripted,nonfiction,chat,4.6,,"Sarah Marshall: Oh, my God. It's like a cat o...",Vanessa Williams Part 1: Becoming Miss America,2021
You're Wrong About,2.0,"[history, education]",unscripted,nonfiction,chat,4.6,,"Mike : Ooh, I have one, I have one I have one...","Bonus: ""The Dark Knight""",2021
You're Wrong About,2.0,"[history, education]",unscripted,nonfiction,chat,4.6,,Sarah: I think that if we were going to have...,Tipper Gore vs. Heavy Metal: The Hearing,2021


## Analysis
Columns: Tokens, Top50, Token_count, Avg_token_len, TTR, kband, Bigrams, Bigram_top25, Sent_toks, Avg_sent_len, POS_frequency, POS_length, Verb_lemmas, Ent_count

In [21]:
# Tokenize
podcast_df['Tokens'] = podcast_df.Text.map(nlp)

In [130]:
podcast_df.Tokens.loc['Welcome to Nightvale'][2][:500]

And now, the news. Have any of our listeners seen the glowing cloud that has been moving in from the west? Well, John Peters, you know, the farmer? He saw it over the Western Ridge this morning, said he would have thought it was the setting sun if it wasn’t for the time of day. Apparently the cloud glows in a variety of colors, perhaps changing from observer to observer, although all report a low whistling when it draws near. One death has already been attributed to the glowcloud.  But listen, it’s probably nothing. If we had to shut down the town for every mysterious event that at least one death could be attributed to, we’d never have time to do anything, right? That’s what the Sheriff’s Secret Police are saying, and I agree, although I would not go so far as to endorse their suggestion to “run directly at the cloud, shrieking and waving your arms, just to see what it does.”  The Apache Tracker, and I remind you that this is that white guy who wears the huge and cartoonishly inaccura

In [23]:
def top50(Tokens):
    counts = Counter(t.text for t in Tokens if t.is_alpha)
    return counts.most_common(50)

In [24]:
podcast_df['Top50'] = podcast_df.Tokens.map(top50)

In [26]:
podcast_df['Token_count'] = podcast_df.Tokens.map(len)

In [27]:
def word_len(Tokens):
    if len(Tokens) > 10:
        lengths = [(w, len(w.text)) for w in Tokens if w.is_alpha]
    else:
        lengths = [('null',0)]
    
    avg = statistics.mean([l[-1] for l in lengths])
    
    return lengths, avg

In [28]:
podcast_df['Token_lengths'] = podcast_df.Tokens.map(lambda x: word_len(x)[0])

In [29]:
podcast_df['Avg_token_len'] = podcast_df.Tokens.map(lambda x: word_len(x)[1])

In [30]:
# TTR
def get_ttr(Tokens):
    if len(Tokens) > 1:
        lower = [t.text.lower() for t in Tokens if t.is_alpha]
        ttr = percent(len(set(lower))/len(lower))
    else:
        ttr = 0
        
    return ttr

In [31]:
podcast_df['TTR'] = podcast_df.Tokens.map(get_ttr)

In [34]:
import pickle
f = open('data/goog_kband.pkl','rb')
goog_kband = pickle.load(f)
f.close()

goog_kband['throughout']

2

In [35]:
def get_kband(Tokens):
    if len(Tokens) > 1:
        kbands = []
        for t in Tokens:
            if t.lemma_ in goog_kband:
                kbands.append((t, goog_kband[t.lemma_]))
        avg_kband = statistics.mean([t[1] for t in kbands])
    else:
        kbands = 0
        avg_kband = 0
    
    return kbands, avg_kband

In [36]:
podcast_df['kband'] = podcast_df.Tokens.map(lambda x: get_kband(x)[0])

In [37]:
podcast_df['Avg_kband'] = podcast_df.Tokens.map(lambda x: get_kband(x)[1])

In [40]:
def bigrams(Tokens):
    if len(Tokens) > 1:
        bigrams = []
        for t in Tokens[:-1]:
            if t.text.isalpha() and Tokens[t.i + 1].text.isalpha():
                bigram = (t.text.lower(), Tokens[t.i + 1].text.lower())
                bigrams.append(bigram)
        counts = Counter(b for b in bigrams).most_common(25)
    else:
        bigrams = 'null'
        
    return bigrams

In [41]:
# add bigrams column
podcast_df['Bigrams'] = podcast_df.Tokens.map(lambda x: bigrams(x))

In [42]:
# FreqDist bigrams
podcast_df['Bigram_top25'] = podcast_df.Bigrams.map(lambda x: Counter(x).most_common(25))

In [43]:
podcast_df['POS'] = podcast_df.Tokens.map(lambda t: [(w, w.pos_) for w in t])

In [44]:
# weighs pos frequency against total text length
def POS_frequency(POS_text):
    counts = Counter(elem[-1].upper() for elem in POS_text)
    total = len(POS_text)
    
    pos_freq = {}
    for (pos, count) in counts.items():
        pos_freq[pos] = percent(count/total)
        
    return pos_freq

In [45]:
podcast_df['POS_freq'] = podcast_df.POS.map(POS_frequency)

In [46]:
podcast_df.POS_freq[0]['NOUN']

11.754

In [47]:
podcast_df['Noun_freq'] = podcast_df.POS_freq.map(lambda x: x.get('NOUN', 'null'))
podcast_df['Verb_freq'] = podcast_df.POS_freq.map(lambda x: x.get('VERB', 'null'))
podcast_df['Adj_freq'] = podcast_df.POS_freq.map(lambda x: x.get('ADJ', 'null'))
podcast_df['Adv_freq'] = podcast_df.POS_freq.map(lambda x: x.get('ADV', 'null'))

In [48]:
podcast_df.sample(5)

Unnamed: 0_level_0,Hosts,Genre-Topic,Scripted/Un,Fiction/Non,Format,Rating,Episode,Text,Title,Year,...,kband,Avg_kband,Bigrams,Bigram_top25,POS,POS_freq,Noun_freq,Verb_freq,Adj_freq,Adv_freq
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Welcome to Nightvale,1.0,"[comedy, sci-fi]",scripted,fiction,news,4.8,7,"In the interest of civic participation, Night ...",history week,2012.0,...,"[(In, 1), (the, 1), (interest, 1), (of, 1), (c...",2.247753,"[(in, the), (the, interest), (interest, of), (...","[((night, vale), 27), ((in, the), 17), ((of, t...","[(In, ADP), (the, DET), (interest, NOUN), (of,...","{'ADP': 10.615, 'DET': 9.899, 'NOUN': 20.388, ...",20.388,10.952,7.287,4.886
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,611,Prologue Ira Glass Saturday mornin...,Vague and Confused,,...,"[(morning, 2), (four, 1), (days, 1), (after, 1...",1.682484,"[(ira, glass), (saturday, morning), (four, day...","[((david, kestenbaum), 57), ((clarence, barry)...","[( , SPACE), (Prologue, NOUN), ( , SPA...","{'SPACE': 4.866, 'NOUN': 12.087, 'PROPN': 7.81...",12.087,12.115,3.886,5.22
Sawbones,2.0,"[history, medicine]",unscripted,nonfiction,storytelling,4.8,352,Sawbones 352: COVID-19: The Final Mile Publish...,COVID 19 The Final Mile,2021.0,...,"[(The, 1), (Published, 5), (Listen, 2), (here,...",1.695554,"[(the, final), (final, mile), (mile, published...","[((i, think), 37), ((if, you), 33), ((you, can...","[(Sawbones, NOUN), (352, NUM), (:, PUNCT), (CO...","{'NOUN': 10.857, 'NUM': 0.705, 'PUNCT': 16.638...",10.857,12.4,4.333,5.295
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,615,Prologue Ira Glass So explain this...,The Beginning of Now,,...,"[(So, 1), (explain, 4), (this, 1), (recording,...",1.767445,"[(ira, glass), (so, explain), (explain, this),...","[((of, the), 59), ((zoe, chace), 53), ((in, th...","[( , SPACE), (Prologue, NOUN), ( , SPA...","{'SPACE': 3.581, 'NOUN': 13.209, 'PROPN': 10.9...",13.209,10.562,4.17,4.947
This American Life,1.5,"[society, history]",unscripted,nonfiction,storytelling,4.6,515,"Prologue Ira Glass OK, here's what ...",Good Guys,,...,"[(OK, 2), (here, 1), ('s, 1), (what, 1), (His,...",1.667843,"[(ira, glass), (what, intrigued), (intrigued, ...","[((sarah, koenig), 48), ((ben, calhoun), 45), ...","[( , SPACE), (Prologue, NOUN), ( , SPA...","{'SPACE': 5.02, 'NOUN': 10.926, 'PROPN': 6.373...",10.926,12.177,4.272,6.152


In [49]:
def POS_length(POS_text):
    pos_dict = {'NOUN': 0, 'VERB': 0, 'ADV': 0, 'ADJ': 0}
    pron_dict = {'i': 0, 'you': 0, 'she': 0, 'he': 0, 'it': 0, 'they': 0, 'we': 0}
    for (token, pos) in POS_text:
        if pos in pos_dict.keys():
            pos_dict[pos] = (pos_dict[pos] + len(token.text))/2
        if token.text in pron_dict.keys():
            pron_dict[token.text] = pron_dict[token.text] + 1
    
    if sum(pron_dict.values()) != 0:
        pron_total = sum(pron_dict.values())
    
    if sum(pron_dict.values()) != 0:
        for (p, c) in pron_dict.items():
            pron_dict[p] = percent(c/pron_total)
    
    
    return pos_dict, pron_dict

# Average word length of each POS
# POS_length[0][0] = noun
#           [0][1] = verb
#           [0][2] = adv
#           [0][3] = adj

# Individual pronoun occurrence weighed against total # of pronouns
# POS_length[1][1] = 'i'
#           [1][2] = 'you'
#           [1][3] ='she'
#           [1][4] = 'he'
#           [1][5] = 'it'
#           [1][6] = 'they'
#           [1][7] = 'we'


In [52]:
podcast_df['POS_length'] = podcast_df.POS.map(lambda p: POS_length(p)[0])

In [53]:
podcast_df['Avg_noun_len'] = podcast_df.POS_length.map(lambda d: d['NOUN'])
podcast_df['Avg_verb_len'] = podcast_df.POS_length.map(lambda d: d['VERB'])
podcast_df['Avg_adj_len'] = podcast_df.POS_length.map(lambda d: d['ADJ'])
podcast_df['Avg_adv_len'] = podcast_df.POS_length.map(lambda d: d['ADV'])

In [60]:
podcast_df['Pron_counts'] = podcast_df.POS.map(lambda p: POS_length(p)[1])

In [61]:
podcast_df['i_count'] = podcast_df.Pron_counts.map(lambda d: d['i'])
podcast_df['you_count'] = podcast_df.Pron_counts.map(lambda d: d['you'])
podcast_df['she_count'] = podcast_df.Pron_counts.map(lambda d: d['she'])
podcast_df['he_count'] = podcast_df.Pron_counts.map(lambda d: d['he'])
podcast_df['it_count'] = podcast_df.Pron_counts.map(lambda d: d['it'])
podcast_df['they_count'] = podcast_df.Pron_counts.map(lambda d: d['they'])
podcast_df['we_count'] = podcast_df.Pron_counts.map(lambda d: d['we'])

In [66]:
podcast_df.POS_freq[0]
podcast_df.Noun_freq[0]

{'SPACE': 4.374,
 'X': 1.097,
 'INTJ': 1.192,
 'ADV': 6.689,
 'PUNCT': 13.771,
 'PRON': 11.388,
 'VERB': 13.108,
 'PROPN': 2.83,
 'ADP': 8.707,
 'NUM': 0.501,
 'DET': 8.504,
 'NOUN': 11.754,
 'ADJ': 4.469,
 'SCONJ': 1.557,
 'AUX': 4.482,
 'CCONJ': 3.737,
 'PART': 1.828,
 'SYM': 0.014}

11.754

In [54]:
# weighs occurrence of each ent against total text length
def ent_counter(Tokens):
    counts = Counter(elem.label_ for elem in Tokens.ents)
    # print(sum(counts.values()))
    
    ent_counter = {}
    for (ent, value) in counts.items():
        ent_counter[ent] = percent(value/len(Tokens))
    # print(sum(ent_counter.values()))
    
    return ent_counter

In [71]:
podcast_df.groupby('podcast').Token_count.min()

podcast
Bullseye with Jesse Thorn     1951
Friendly Fire                13263
Judge John Hodgman           10792
MBMBaM                       14422
Move Your DNA                 2278
NeoScum                      13765
One Bad Mother               12230
Radiolab                      2116
Sawbones                      8504
Shmanners                     8035
Switchblade Sisters           9321
The Adventure Zone            9152
The Flophouse                20331
The Greatest Generation      17336
This American Life            8591
Unlocking Us                  5619
Welcome to Nightvale          2156
Wonderful                     8823
You're Wrong About            5811
Name: Token_count, dtype: int64

In [102]:
# how do I read all the text??
podcast_df.loc[podcast_df.Token_count==1951].Text[:]

podcast
Bullseye with Jesse Thorn      jesse thorn  It’s  Bullseye  . I’m Jesse Tho...
Name: Text, dtype: object

In [83]:
pickle.format_version

'4.0'

In [86]:
sum(podcast_df.Token_count)

18270894

In [104]:
# most common verb lemmas
def verb_lemmas(POS_text):
    counts = Counter(elem[0].lemma_ for elem in POS_text if elem[1] == 'VERB')
    
    verb_counter = {}
    for (verb, value) in counts.most_common(20):
        verb_counter[verb] = percent(value/sum(counts.values()))
        
    return verb_counter

verb_lemmas(podcast_df.POS[1])

{'know': 9.301,
 '’': 9.077,
 'be': 9.003,
 'do': 5.208,
 'think': 4.613,
 'have': 3.125,
 'feel': 2.827,
 'get': 2.307,
 'mean': 2.232,
 'see': 2.009,
 'gon': 1.786,
 'say': 1.637,
 'laugh': 1.562,
 'go': 1.562,
 'watch': 1.414,
 'love': 1.414,
 '’re': 1.265,
 'start': 1.19,
 'wanna': 1.116,
 'want': 1.116}

In [105]:
podcast_df['verb_lemmas'] = podcast_df.POS.map(verb_lemmas)

In [115]:
podcast_df['Sent_toks'] = podcast_df.Text.map(nltk.sent_tokenize)

In [116]:
# minor alteration to unit_len
def sent_len(doc):
    sentlens = []
    for c in doc:
        length = len([l for l in c.split()])
        sentlens.append((c, length))
        
    return sentlens

In [123]:
sent_len(podcast_df.Sent_toks[0][:10])

[('  jesse thorn  Hey all, it’s Jesse.', 6),
 ('As 2020 draws to a close, think about what you’re thankful for, other than—I’m willing to bet—2020 drawing to a close.',
  21),
 ('What got you through the year?', 6),
 ('Odds are, if you’re hearing my voice, public radio was one of the things.',
  14),
 ('Public radio gave you accurate, dependable news about the election on the pandemic, information about local stories that matter to you.',
  21),
 ('You got fun and fascinating interviews from shows like  Bullseye  .', 11),
 ('If you wanna show your gratitude at the end of this year, consider supporting your local public radio station.',
  19),
 ('Public radio stations really need your help right now, more than ever.',
  12),
 ('And it’s really easy to do!', 6),
 ('Just go to  Donate.NPR.org/bullseye  and give whatever you can.', 9)]

In [118]:
podcast_df['Sent_length'] = podcast_df.Sent_toks.map(sent_len)

How to extract host names??  Some are full names and some are just first names.  Maybe compare top 50 tokens and enable/kbands.  Any word thats in top50 and not kband 1 is likely a name (right?  maybe?)

In [120]:
podcast_df.loc['The Flophouse']

Unnamed: 0_level_0,Hosts,Genre-Topic,Scripted/Un,Fiction/Non,Format,Rating,Episode,Text,Title,Year,...,i_count,you_count,she_count,he_count,it_count,they_count,we_count,verb_lemmas,Sent_toks,Sent_length
podcast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,315,dan mccoy On this episode we discuss: Hawk...,Hawk the Slayer,,...,0.0,24.118,3.527,16.243,33.798,10.747,11.567,"{'’': 11.445, 'laugh': 6.435, 'be': 6.197, 'ha...",[ dan mccoy On this episode we discuss: Haw...,[( dan mccoy On this episode we discuss: Ha...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,316,dan mccoy On this episode we discuss: Betw...,Between Worlds,,...,0.0,22.166,9.828,17.073,33.788,8.895,8.25,"{'’': 10.071, 'laugh': 6.471, 'be': 6.165, 'ha...",[ dan mccoy On this episode we discuss: Bet...,[( dan mccoy On this episode we discuss: Be...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,317,"dan mccoy On this episode, we discuss: Art...","Artemis Fowl, w/ Scott Weinberg",,...,0.0,27.886,3.825,12.239,32.267,12.239,11.544,"{'’': 10.731, 'laugh': 6.122, 'be': 5.332, 'do...","[ dan mccoy On this episode, we discuss: Ar...","[( dan mccoy On this episode, we discuss: A..."
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,318,dan mccoy On this episode we discuss: Huds...,"Hudson Hawk, with Roman Mars",,...,0.0,21.658,3.583,14.92,36.845,15.134,7.861,"{'’': 10.934, 'laugh': 7.296, 'be': 6.18, 'get...",[ dan mccoy On this episode we discuss: Hud...,[( dan mccoy On this episode we discuss: Hu...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,319,"music Light, up-tempo, electric guitar with...",Battle Angel LIVE,,...,0.0,30.126,9.622,13.058,25.888,8.935,12.371,"{'’': 10.802, 'laugh': 9.554, 'be': 5.348, 'ha...","[ music Light, up-tempo, electric guitar wit...","[( music Light, up-tempo, electric guitar wi..."
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,320,"dan On this episode, we discuss— Last Chri...",Last Christmas,,...,0.0,24.723,11.594,10.827,34.101,7.076,11.679,"{'’': 10.197, 'be': 6.893, 'laugh': 5.013, 'ha...","[ dan On this episode, we discuss— Last Chr...","[( dan On this episode, we discuss— Last Ch..."
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,321,dan On this episode we discuss— The Call o...,"Call of the Wild, with Jesse Thorn",,...,0.0,19.938,1.336,20.144,38.335,13.155,7.091,"{'’': 9.697, 'be': 7.636, 'laugh': 5.788, 'hav...",[ dan On this episode we discuss— The Call ...,[( dan On this episode we discuss— The Call...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,322,dan On this episode we discuss— Money Plan...,Money Plane,,...,0.0,23.629,3.226,16.048,30.645,15.242,11.21,"{'’': 11.334, 'be': 5.524, 'laugh': 5.263, 'ha...",[ dan On this episode we discuss— Money Pla...,[( dan On this episode we discuss— Money Pl...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,323,dan On this episode we discuss— Deadly Les...,Deadly Lessons,,...,0.0,22.941,3.012,15.323,36.138,11.515,11.072,"{'’': 10.864, 'be': 6.25, 'have': 5.053, 'laug...",[ dan On this episode we discuss— Deadly Le...,[( dan On this episode we discuss— Deadly L...
The Flophouse,3.0,"[comedy, movies]",unscripted,nonfiction,recap,4.8,324,dan mccoy On this episode of The Flop Hous...,Hellboy LIVE,,...,0.0,27.525,3.843,15.013,33.333,8.847,11.439,"{'’': 9.804, 'laugh': 9.696, 'be': 4.915, 'hav...",[ dan mccoy On this episode of The Flop Hou...,[( dan mccoy On this episode of The Flop Ho...
