# All podcasts

### Load elements and build DataFrame

In [None]:
import pandas as pd
import re
import nltk
import spacy
from spacy.lang.en import English
import collections
from collections import Counter
from itertools import chain
import statistics
import math
import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# initialize spacy objects: load the 'en_core_web_md' English pipeline.
# `nlp` is later mapped over every transcript to build the Tokens column.
nlp = spacy.load('en_core_web_md')

In [None]:
%store -r Nightvale_df
%store -r myDNA_df
%store -r YWA_df
%store -r uu_df
%store -r radiolab_df
%store -r tal_df
%store -r bullseye
%store -r mother
%store -r hodgman
%store -r flophouse
%store -r switchblade
%store -r mbmbam
%store -r sawbones
%store -r wonderful
%store -r tgg
%store -r ffire
%store -r shmanners
%store -r taz



# %store -r freak_df
# %store -r Lore_df
# %store -r Invisible_df
# %store -r OnBeing_df
# %store -r StoryCorps_df

In [None]:
# Create a multi-indexed DataFrame: stack every per-podcast frame and use the
# podcast title (from `keys`, in the same order as the frames) as the outer
# index level.
# Only the first six frames have their index reset before concatenation; the
# remaining frames are concatenated with the index they were stored with.
data = pd.concat([Nightvale_df.reset_index(drop=True),
                  myDNA_df.reset_index(drop=True),
                  YWA_df.reset_index(drop=True),                  
                  uu_df.reset_index(drop=True),
                  radiolab_df.reset_index(drop=True),
                  tal_df.reset_index(drop=True),
                  bullseye,
                  mother,
                  hodgman,
                  flophouse,
                  switchblade,
                  mbmbam,
                  sawbones,
                  wonderful,
                  tgg,
                  ffire,
                  shmanners,
                  taz],
                  # currently excluded: StoryCorps_df, OnBeing_df, Invisible_df, Lore_df],
                  keys = ['Nightvale','Move Your DNA','You\'re Wrong About','Unlocking Us',
                         'Radiolab','This American Life', 'Bullseye with Jesse Thorn','One Bad Mother',
                         'Judge John Hodgman','The Flophouse','Switchblade Sisters',
                         'MBMBaM','Sawbones','Wonderful','The Greatest Generation','Friendly Fire','Shmanners',
                          'The Adventure Zone'])

In [None]:
# Drop the 'Podcast' column (presumably redundant now that the podcast title
# is the outer index level added by pd.concat(keys=...) -- confirm), then
# preview the first rows.
data = data.drop(columns=['Podcast'])
data.head(10)

In [None]:
data.Text[0][:1000]

### Text-processing functions

In [None]:
# anticipating a lot of ugly floats
def percent(decimal):
    """Convert a fraction to a percentage with three decimal places.

    The value is multiplied by 100, formatted with three digits after the
    decimal point, and parsed back into a float
    (e.g. 0.45615981981 -> 45.616).
    """
    return float(f"{decimal * 100:.3f}")

percent(0.45615981981)

## Analysis
Columns: Tokens, Top50, Token_count, Avg_token_len, TTR, kband, Bigrams, Bigram_top25, Sent_toks, Avg_sent_len, POS_frequency, POS_length, Verb_lemmas, Ent_count

In [None]:
# Tokenize: run the spaCy pipeline (`nlp`) over every transcript in `Text`
# and store the resulting object for each row.
data['Tokens'] = data.Text.map(nlp)

In [None]:
def top50(Tokens):
    """Return the 50 most frequent alphabetic tokens as (text, count) pairs.

    Only tokens whose ``is_alpha`` flag is true are counted, and the token
    text is used as-is, so counting is case-sensitive.
    """
    tally = Counter()
    for tok in Tokens:
        if tok.is_alpha:
            tally[tok.text] += 1
    return tally.most_common(50)

In [None]:
# 50 most common alphabetic tokens per row, as (text, count) pairs (see top50)
data['Top50'] = data.Tokens.map(top50)

In [None]:
# Length of each row's Tokens entry
data['Token_count'] = data.Tokens.map(len)

In [None]:
def word_len(Tokens):
    """Return the lengths of the alphabetic tokens and their mean.

    Args:
        Tokens: a sequence of token objects exposing ``text`` and
            ``is_alpha`` (the notebook passes the entries of ``data.Tokens``).

    Returns:
        A ``(lengths, avg)`` tuple. ``lengths`` is a list of
        ``(token, len(token.text))`` pairs for the alphabetic tokens and
        ``avg`` is the mean of those lengths. Inputs of 10 tokens or fewer,
        and inputs without any alphabetic token, yield the placeholder
        ``([('null', 0)], 0)``.
    """
    if len(Tokens) <= 10:
        return [('null', 0)], 0

    lengths = [(w, len(w.text)) for w in Tokens if w.is_alpha]
    if not lengths:
        # statistics.mean raises StatisticsError on empty data; a document
        # made only of punctuation/numbers gets the same placeholder as a
        # too-short document instead of aborting the whole column.
        return [('null', 0)], 0

    avg = statistics.mean([size for _, size in lengths])
    return lengths, avg

In [None]:
# (token, length) pairs per row -- first element of word_len's result
data['Token_lengths'] = data.Tokens.map(lambda x: word_len(x)[0])

In [None]:
# Mean token length per row -- second element of word_len's result
# (word_len is evaluated again here for every row)
data['Avg_token_len'] = data.Tokens.map(lambda x: word_len(x)[1])

In [None]:
# TTR
def get_ttr(Tokens):
    """Return the type-token ratio of the alphabetic tokens as a percentage.

    The ratio is the number of distinct lower-cased alphabetic tokens
    divided by the total number of alphabetic tokens, passed through
    ``percent``.

    Args:
        Tokens: a sequence of token objects exposing ``text`` and
            ``is_alpha``.

    Returns:
        The ratio as a percentage, or 0 when the input has at most one token
        or contains no alphabetic token.
    """
    if len(Tokens) <= 1:
        return 0

    lower = [t.text.lower() for t in Tokens if t.is_alpha]
    if not lower:
        # Nothing alphabetic to count: avoid dividing by zero.
        return 0

    return percent(len(set(lower)) / len(lower))

In [None]:
# Type-token ratio per row, as a percentage (see get_ttr)
data['TTR'] = data.Tokens.map(get_ttr)

In [None]:
# Import the Google k-band lookup table (indexed by word, see the lookup below).
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# this file if it comes from a trusted source.
# The context manager closes the file even if unpickling fails.
with open('data/goog_kband.pkl', 'rb') as f:
    goog_kband = pickle.load(f)

goog_kband['throughout']

In [None]:
def get_kband(Tokens, kband_table=None):
    """Look up the k-band of every token whose lemma is in the k-band table.

    Args:
        Tokens: a sequence of token objects exposing ``lemma_``.
        kband_table: optional mapping from lemma to k-band value. When
            omitted, the module-level ``goog_kband`` table is used.

    Returns:
        A ``(kbands, avg_kband)`` tuple. For inputs of more than one token,
        ``kbands`` is a list of ``(token, band)`` pairs for the tokens whose
        lemma is in the table and ``avg_kband`` is the mean of those bands
        (0 when no lemma was found). For shorter inputs both values are the
        integer 0.
    """
    if kband_table is None:
        kband_table = goog_kband

    if len(Tokens) <= 1:
        # Deliberately kept as the integer 0 (not an empty list) so existing
        # consumers of the column see the same placeholder as before.
        return 0, 0

    kbands = [(t, kband_table[t.lemma_]) for t in Tokens
              if t.lemma_ in kband_table]
    if not kbands:
        # statistics.mean raises StatisticsError on empty data.
        return kbands, 0

    avg_kband = statistics.mean([band for _, band in kbands])
    return kbands, avg_kband

In [None]:
# (token, k-band) pairs per row -- first element of get_kband's result
data['kband'] = data.Tokens.map(lambda x: get_kband(x)[0])

In [None]:
# Mean k-band per row -- second element of get_kband's result
# (get_kband is evaluated again here for every row)
data['Avg_kband'] = data.Tokens.map(lambda x: get_kband(x)[1])

In [None]:
def bigrams(Tokens):
    """Return the lower-cased alphabetic bigrams of a token sequence.

    A bigram is kept only when both neighbouring tokens consist solely of
    alphabetic characters; the pairs are returned in document order with
    duplicates preserved so they can be counted afterwards.

    Args:
        Tokens: a sliceable sequence of token objects exposing ``text``.

    Returns:
        A list of ``(first, second)`` string tuples, or the string
        ``'null'`` when the input has fewer than two tokens.
    """
    if len(Tokens) <= 1:
        return 'null'

    pairs = []
    # Pair each token with its successor by position, so the result does not
    # depend on the tokens' own index attribute.
    for first, second in zip(Tokens, Tokens[1:]):
        if first.text.isalpha() and second.text.isalpha():
            pairs.append((first.text.lower(), second.text.lower()))
    return pairs

In [None]:
# add bigrams column: apply `bigrams` to each row's Tokens entry
data['Bigrams'] = data.Tokens.map(bigrams)

In [None]:
# Frequency distribution of bigrams: the 25 most common per row.
# `bigrams` returns the string 'null' for rows with fewer than two tokens;
# Counter('null') would tally its characters, so those rows get an empty
# list instead.
data['Bigram_top25'] = data.Bigrams.map(
    lambda x: Counter(x).most_common(25) if isinstance(x, list) else [])

In [None]:
# Pair every token with its `pos_` tag: one list of (token, tag) tuples per row
data['POS'] = data.Tokens.map(lambda t: [(w, w.pos_) for w in t])

In [None]:
# weighs pos frequency against total text length
def POS_frequency(POS_text):
    """Return each POS tag's share of the text as a percentage.

    ``POS_text`` is a sequence whose items end with a POS tag string (the
    notebook passes ``(token, tag)`` pairs). Tags are upper-cased before
    counting and each count is divided by the total number of items.
    """
    tag_counts = Counter()
    for item in POS_text:
        tag_counts[item[-1].upper()] += 1

    n_items = len(POS_text)
    return {tag: percent(n / n_items) for tag, n in tag_counts.items()}

In [None]:
# Percentage share of each POS tag per row (see POS_frequency)
data['POS_freq'] = data.POS.map(POS_frequency)

In [None]:
data.POS_freq[0]['NOUN']

In [None]:
# Pull four POS shares out of the POS_freq dicts into their own columns.
# A tag missing from a row's dict yields the string 'null' rather than a
# number, so these columns can hold mixed types.
data['Noun_freq'] = data.POS_freq.map(lambda x: x.get('NOUN', 'null'))
data['Verb_freq'] = data.POS_freq.map(lambda x: x.get('VERB', 'null'))
data['Adj_freq'] = data.POS_freq.map(lambda x: x.get('ADJ', 'null'))
data['Adv_freq'] = data.POS_freq.map(lambda x: x.get('ADV', 'null'))

In [None]:
data.sample(5)

In [None]:
def POS_length(POS_text):
    """Summarise word length per part of speech and pronoun usage.

    Args:
        POS_text: iterable of ``(token, pos)`` pairs, where ``token``
            exposes ``text`` and ``pos`` is a POS tag string.

    Returns:
        A ``(pos_dict, pron_dict)`` tuple.

        ``pos_dict`` maps 'NOUN', 'VERB', 'ADV' and 'ADJ' to the mean length
        in characters of the tokens carrying that tag (0 when the tag never
        occurs).

        ``pron_dict`` maps 'i', 'you', 'she', 'he', 'it', 'they' and 'we' to
        that pronoun's share, as a percentage, of all occurrences of these
        seven words. When none of them occurs every value stays 0.
    """
    tracked_pos = ('NOUN', 'VERB', 'ADV', 'ADJ')
    length_sums = dict.fromkeys(tracked_pos, 0)
    tag_counts = dict.fromkeys(tracked_pos, 0)
    pron_dict = {'i': 0, 'you': 0, 'she': 0, 'he': 0, 'it': 0, 'they': 0, 'we': 0}

    for token, pos in POS_text:
        if pos in length_sums:
            # Accumulate sum and count; a running (previous + new) / 2 is
            # not a mean -- it weights the last tokens far more than the
            # first ones.
            length_sums[pos] += len(token.text)
            tag_counts[pos] += 1

        # Match case-insensitively: "I" is always capitalised and the other
        # pronouns are capitalised at the start of a sentence.
        word = token.text.lower()
        if word in pron_dict:
            pron_dict[word] += 1

    pos_dict = {
        pos: (length_sums[pos] / tag_counts[pos] if tag_counts[pos] else 0)
        for pos in tracked_pos
    }

    pron_total = sum(pron_dict.values())
    if pron_total:
        pron_dict = {p: percent(c / pron_total) for p, c in pron_dict.items()}

    return pos_dict, pron_dict

# POS_length returns two dicts, accessed by key rather than by position:
#
# Average word length of each POS
# POS_length(...)[0]['NOUN'] = noun
#                   ['VERB'] = verb
#                   ['ADV']  = adv
#                   ['ADJ']  = adj
#
# Individual pronoun occurrence weighed against total # of pronouns
# POS_length(...)[1]['i']
#                   ['you']
#                   ['she']
#                   ['he']
#                   ['it']
#                   ['they']
#                   ['we']


In [None]:
# Keep only the first element returned by POS_length (the per-POS length
# dict); the pronoun dict (element [1]) is not stored here.
data['POS_length'] = data.POS.map(lambda p: POS_length(p)[0])

In [None]:
# One column per tracked POS, read from the POS_length dicts by tag
data['Avg_noun_len'] = data.POS_length.map(lambda d: d['NOUN'])
data['Avg_verb_len'] = data.POS_length.map(lambda d: d['VERB'])
data['Avg_adj_len'] = data.POS_length.map(lambda d: d['ADJ'])
data['Avg_adv_len'] = data.POS_length.map(lambda d: d['ADV'])

In [None]:
# weighs occurrence of each ent against total text length
def ent_counter(Tokens):
    """Return each entity label's frequency relative to the text length.

    The labels (``label_``) of the items in ``Tokens.ents`` are counted and
    each count is divided by ``len(Tokens)`` and converted with ``percent``.
    An input without entities yields an empty dict.
    """
    label_counts = Counter()
    for ent in Tokens.ents:
        label_counts[ent.label_] += 1

    return {
        label: percent(n / len(Tokens))
        for label, n in label_counts.items()
    }

In [None]:
data.sample(5)

### YOU ARE HERE

In [None]:
# most common verb lemmas
def verb_lemmas(POS_text):
    """Return the 20 most common verb lemmas with their percentage share.

    ``POS_text`` holds ``(token, tag)`` pairs. Lemmas (``lemma_``) of tokens
    tagged 'VERB' are counted; each of the 20 most frequent lemmas is mapped
    to its count divided by the total number of verb tokens, converted with
    ``percent``.
    """
    lemma_counts = Counter()
    for elem in POS_text:
        if elem[1] == 'VERB':
            lemma_counts[elem[0].lemma_] += 1

    n_verbs = sum(lemma_counts.values())
    return {
        lemma: percent(n / n_verbs)
        for lemma, n in lemma_counts.most_common(20)
    }

verb_lemmas(data.POS[1])

In [None]:
data

In [None]:
# Split each transcript into sentences with NLTK's sent_tokenize
data['Sent_toks'] = data.Text.map(nltk.sent_tokenize)

In [None]:
# minor alteration to unit_len
def sent_len(doc):
    """Pair every sentence in ``doc`` with its whitespace-delimited word count.

    Returns a list of ``(sentence, word_count)`` tuples in input order.
    """
    return [(sentence, len(sentence.split())) for sentence in doc]

In [None]:
# (sentence, word count) pairs per row (see sent_len)
data['Sent_length'] = data.Sent_toks.map(sent_len)

Open question: how can host names be extracted? Some are given as full names and others as first names only.

In [None]:
data.loc['The Flophouse']