In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from itertools import chain

## Get Episode URLs

In [2]:
base_url = 'https://gimletmedia.com/shows/reply-all/episodes'

response = requests.get(base_url)

page = response.text

soup = BeautifulSoup(page, "html.parser")

In [3]:
def getEpisodes(soup):
    """
    grabs unique episode links from ReplyAll episodes list
    """
    
    table = soup.findAll('div', {'class' : 'title-section'})
    
    episodes = []
    for element in table:
        
        episodes.append(element.find('a')['href'])
    
    return episodes

In [4]:
episode_links = getEpisodes(soup)

In [5]:
episode_links

['/shows/reply-all/wbhawn/96-the-secret-life-of-alex-goldman',
 '/shows/reply-all/2oh5o6/128-the-crime-machine-part-ii',
 '/shows/reply-all/76h967/127-the-crime-machine-part-i',
 '/shows/reply-all/emh82l/120-invcel',
 '/shows/reply-all/xjhx3l/146-summer-hotline',
 '/shows/reply-all/rnhzlo/145-louder',
 '/shows/reply-all/6nhgol/144-dark-pattern',
 '/shows/reply-all/76hdrj/143-permanent-record',
 '/shows/reply-all/gmhr36/142-we-didnt-start-the-fire',
 '/shows/reply-all/39hrwn/41-what-it-looks-like',
 '/shows/reply-all/z3hgd2/141-adam-pisces-and-the-2-coke',
 '/shows/reply-all/brh8jm/140-the-roman-mars-mazda-virus',
 '/shows/reply-all/5whgo2/139-the-reply-all-hotline',
 '/shows/reply-all/j4h6jd/138-the-great-momo-panic',
 '/shows/reply-all/94hord/137-fools-trade',
 '/shows/reply-all/dvhd9k/136-the-founder',
 '/shows/reply-all/awhk76/135-robocall-bang-bang',
 '/shows/reply-all/8wh258/134-the-year-of-the-wallop',
 '/shows/reply-all/8who5k/133-reply-alls-2018-year-end',
 '/shows/reply-all/wb

In [6]:
len(episode_links) #need to remove rebroadcasts/gimlet podcast announcements

159

In [7]:
#remove gimlet show updates
pattern = '\/shows\/reply-all\/\w*\/[0-9]+'
episode_numbers = []

for episode in episode_links:
    if re.match(pattern, episode):
        episode_numbers.append(episode)

In [28]:
episode_numbers

['/shows/reply-all/xjhx3l/146-summer-hotline',
 '/shows/reply-all/rnhzlo/145-louder',
 '/shows/reply-all/6nhgol/144-dark-pattern',
 '/shows/reply-all/76hdrj/143-permanent-record',
 '/shows/reply-all/gmhr36/142-we-didnt-start-the-fire',
 '/shows/reply-all/z3hgd2/141-adam-pisces-and-the-2-coke',
 '/shows/reply-all/brh8jm/140-the-roman-mars-mazda-virus',
 '/shows/reply-all/5whgo2/139-the-reply-all-hotline',
 '/shows/reply-all/j4h6jd/138-the-great-momo-panic',
 '/shows/reply-all/94hord/137-fools-trade',
 '/shows/reply-all/dvhd9k/136-the-founder',
 '/shows/reply-all/awhk76/135-robocall-bang-bang',
 '/shows/reply-all/8wh258/134-the-year-of-the-wallop',
 '/shows/reply-all/8who5k/133-reply-alls-2018-year-end',
 '/shows/reply-all/wbhjwd/132-negative-mount-pleasant',
 '/shows/reply-all/v4he8w/131-surefire-investigations',
 '/shows/reply-all/v4he6k/130-the-snapchat-thief',
 '/shows/reply-all/o2ho2j/129-autumn',
 '/shows/reply-all/n8hwl7/128-the-crime-machine-part-ii',
 '/shows/reply-all/o2hx34/12

In [8]:
len(episode_numbers)

156

#### Some manual episode removal 
Can recreate with regex in another iteration

In [9]:
episode_numbers.remove('/shows/reply-all/emh82l/120-invcel')
episode_numbers.remove('/shows/reply-all/39hrwn/41-what-it-looks-like')
episode_numbers.remove('/shows/reply-all/rnholw/47-quit-already-rebroadcast')
episode_numbers.remove('/shows/reply-all/39hz7w/29-the-takeover-rebroadcast')
episode_numbers.remove('/shows/reply-all/xjhed4/6-this-proves-everything')
episode_numbers.remove('/shows/reply-all/94hw3v/1-an-app-sends-a-stranger-to-say-i-love')
episode_numbers.remove('/shows/reply-all/z3hld4/55-jennicam-revisited')
episode_numbers.remove('/shows/reply-all/gmhn8r/39-reply-all-exploder')
episode_numbers.remove('/shows/reply-all/76h967/127-the-crime-machine-part-i')
episode_numbers.remove('/shows/reply-all/2oh5o6/128-the-crime-machine-part-ii')
episode_numbers.remove('/shows/reply-all/wbhawn/96-the-secret-life-of-alex-goldman')

In [10]:
len(episode_numbers) #woo!

145

## Get Transcripts

In [14]:
gimlet = 'https://gimletmedia.com'

In [11]:
#This method gets most episodes (130). Loses episodes 65-77 
#from variation in webpage setup between episodes

def getTranscripts(url):
    
    """
    Takes in the epiosde url and scrapes trancripts 
    """
    
    r = requests.get(url)
    p = r.text
    s = BeautifulSoup(p, 'lxml')
    
    transcript_html = s.find('div', {'class': 'box transcript-box'})
    
    span_elements = transcript_html.findAll(['span', 'p'])
    
    text = ''
    
    for element in span_elements:
        text += element.text
    
    return text

In [15]:
def createDocuments():
    """
    format transcripts for insertion into MongoDB
    """
    
    document_list = []
    
    for episode in episode_numbers:
        url = gimlet + episode
    
        try:

            document_list.append({
                'episodeID' : episode,
                'full_trans' : getTranscripts(url),
                'by_speaker': {
                    
                    'PJ': '',
                    'Alex': ''
                }
                })

        except AttributeError:
            print(episode)
    
    return document_list

In [16]:
total_dictionaries = createDocuments() 

/shows/reply-all/dvhe8r/47-quit-already


## Get Speaker Names and Text

In [18]:
def getName(document):
    
    names = []
    clean_names = []
    
    names.append(re.findall("[A-Z ]+:", document['full_trans']))
    

    for name in names:       
        
        name = [x.replace('BREAK', '') for x in name]
        name = [x.replace('MUSIC', '') for x in name]
        name = [x.replace('CREDITS', '') for x in name]
        
        name = [x.strip() for x in name]
        
        clean_names.append(name)
    
    
    return clean_names

In [20]:
def getQuote(document):
    quotes = []
    
    quotes.append(re.split("[A-Z ]+:", document['full_trans']))
    
    for quote in quotes:
        quote.pop(0)
    
    return quotes

### Look through all unique speakers 
to find names PJ and Alex are referred to and control for errors created by earlier RegEx name extraction

In [56]:
total_names = []
for document in total_dictionaries:
    name = getName(document)
    total_names += name

In [57]:
names_unpacked = list(chain(*total_names))

In [58]:
len(names_unpacked) #total number of speakers

33306

In [59]:
names_unique = set(names_unpacked)

In [60]:
len(names_unique) #number of unique speakers

1023

In [84]:
list(names_unique)[:500]

['THEATER:',
 'TOR:',
 'NATHANIEL WAUGH:',
 'ELLIOTT WILSON:',
 'MARYANNE:',
 'PM:',
 'LINDSEY STONE:',
 'S MANAGER:',
 'TOR EKELAND:',
 'ED MAGEDSON:',
 'AARON SWARTZ:',
 'RICHARD:',
 'HILLARY CLINTON:',
 'VILLAGERS:',
 'VOICEMAIL:',
 'MICHELLE GOMEZ:',
 'LP:',
 'SAMIN:',
 'MARK FORMAN:',
 'OTHERS:',
 'ALEX :',
 'PHIA:',
 'LARGE MAN:',
 'ANDY MILLS:',
 'OTHER MAN:',
 'RYAN HAGEN:',
 'THOMAS OSCAR:',
 'JUNIPER:',
 'JIMMY FALLON:',
 'VIRGIL TEXAS:',
 'DALIS:',
 'THEMEALEX:',
 'JIA:',
 'JO ROTH:',
 'ANDY:',
 'PROTESTER:',
 'SYLVIA:',
 'LUISA:',
 'LINDA MODROWSKI:',
 'LONNY PRICE:',
 'KD:',
 'DAVID SWENSON:',
 'PETER SMITH:',
 'KALILA HOLT:',
 'HOWARD:',
 'ALISA LIBBY:',
 'ALL:',
 'NANCY:',
 'CAPTAIN AMERICA:',
 'GREG KNAUSS:',
 'VIDEO NARRATOR:',
 'PHONE RINGSPJ:',
 'ANDREA NOEL:',
 'JOSEPH:',
 'JOSH LARSON:',
 'KIM ZETTER:',
 'NICK:',
 'POPE:',
 'AUTOMATED VOICE:',
 'SNOW PLOW SHOW:',
 'CARL:',
 'PJ AND ALEX BLUMBERG:',
 'NAZANIN:',
 'KELLY GALLAHER:',
 'JOAN:',
 'PETER CARROLL:',
 'TOM

In [85]:
list(names_unique)[490:]

['CHIARA ATIK:',
 'PJ VOGT:',
 'HOWARD WALTZMAN:',
 'HARVEY:',
 'JON RONSON:',
 'NEWS CLIP:',
 'BRITTANY:',
 'M:',
 'KAREN:',
 'JACOB:',
 'LAURA LEE:',
 'PEOPLE IN BACKGROUND:',
 'LULU MILLER:',
 'CHRIS:',
 'ALEX:',
 'PJ:',
 'MARCO RUBIO:',
 'GENE:',
 'S EMPLOYEE:',
 'WOLF:',
 'SEAN:',
 'BILL:',
 'EMILY KENNEDY:',
 'NATHANIEL:',
 'AB:',
 'JOSHUA:',
 'PAIGE:',
 'ED:',
 'YOUSSEF:',
 'RHYMEFEST:',
 'CAB DRIVER:',
 'ADAM SINGOLDA:',
 'BLIPPI:',
 'BRITTNEY:',
 'HAVN:',
 'KYLIE OCHOA:',
 'TED CRUZ:',
 'ROBIN RADCLIFFE:',
 'SMASH MOUTH:',
 'LYNDON JOHNSON:',
 'TOMMY LOFTUS:',
 'YOUTUBE:',
 'PJ AND SRUTHI:',
 'MOSHE:',
 'GREGORY:',
 'LISA:',
 'CAYDEN:',
 'DAVID FREW:',
 'NEWSCASTER:',
 'HIGINIO OCHOA:',
 'JOE:',
 'LONNY:',
 'TAYLOR VIDEO:',
 'ERIC VALOR:',
 'KASHMIR:',
 'CLAY:',
 'TRISHA:',
 'MATT LIEBER:',
 'MIKE DOUGLAS:',
 'TAMMY:',
 'SHARMA:',
 'VLOGGER:',
 'TPJ:',
 'MELANIE:',
 'POSTAL WORKER:',
 'OFFICER:',
 'MATTHEW KEYS:',
 'JOSH MUMM:',
 'PICKLES VIDEO:',
 'MARIA:',
 'EMILY:',
 'SAL:'

In [61]:
aliases = {'PJ': ['DPI HDTVPJ:', 'PHONE RINGSPJ:', 'M PJ VOGT VIDEO:', 'PJPJ:', 'IM PJ VOGTPJ:', 'PJPJ VOGT:',
                       'PJ VOGT:', 'PJ:', 'PJ :', 'OKPJ:', 'POSTSPJ:', 'SELF DESTRUCT BUTTONPJ:', 'TPJ:'],
                 
           'Alex': ['ALEX:', 'AG:', 'GOLDMAN:', 'ALEX GOLDMAN:', 'GODMAN:', 'GENLTEMENGOLDMAN:']
                    }

## Finalize documents for Mongo

In [62]:
for document in total_dictionaries:
    
    document['by_speaker']['PJ'] = ''
    document['by_speaker']['Alex'] = ''

    names = getName(document)
    quotes = getQuote(document)
    
    if len(names) == len(quotes):

        for index,name in enumerate(names):            
            if name in aliases['PJ']:
                document['by_speaker']['PJ'] += ' ' + quotes[index]

            elif name in aliases['Alex']:
                document['by_speaker']['Alex'] +=  ' ' + quotes[index]

    else:
        print(document['episodeID'])

## Put data in MongoDB

In [63]:
from pymongo import MongoClient

In [64]:
client = MongoClient()

In [65]:
db = client['replyall']

In [None]:
db.replyall.insert_many(total_dictionaries)

In [67]:
df = pd.DataFrame(db.replyall.find())

In [68]:
df.head()

Unnamed: 0,_id,by_speaker,episodeID,full_trans
0,5d55ed03a18334a684f84834,{'PJ': ' Hey Alex. You remember the other da...,/shows/reply-all/xjhx3l/146-summer-hotline,PJ VOGT: Hey Alex.ALEX GOLDMAN: Hey PJ.PJ: You...
1,5d55ed03a18334a684f84835,"{'PJ': '', 'Alex': ' From Gimlet this is Repl...",/shows/reply-all/rnhzlo/145-louder,ALEX GOLDMAN: From Gimlet this is Reply All. I...
2,5d55ed03a18334a684f84836,{'PJ': ' From Gimlet this is Reply All. I’m P...,/shows/reply-all/6nhgol/144-dark-pattern,PJ VOGT: From Gimlet this is Reply All. I’m PJ...
3,5d55ed03a18334a684f84837,"{'PJ': ' From Gimlet, this is Reply All. I’m ...",/shows/reply-all/76hdrj/143-permanent-record,"[THEME MUSIC]PJ VOGT: From Gimlet, this is Rep..."
4,5d55ed03a18334a684f84838,"{'PJ': ' From Gimlet, this is Reply All. I’m ...",/shows/reply-all/gmhr36/142-we-didnt-start-the...,"[THEME MUSIC]PJ: From Gimlet, this is Reply Al..."


In [69]:
df = pd.concat([df.drop(['by_speaker'], axis=1), df['by_speaker'].apply(pd.Series)], axis=1)

In [70]:
df.head()

Unnamed: 0,_id,episodeID,full_trans,PJ,Alex
0,5d55ed03a18334a684f84834,/shows/reply-all/xjhx3l/146-summer-hotline,PJ VOGT: Hey Alex.ALEX GOLDMAN: Hey PJ.PJ: You...,Hey Alex. You remember the other day we tol...,"Hey PJ. I do remember that. Alright. Uh,..."
1,5d55ed03a18334a684f84835,/shows/reply-all/rnhzlo/145-louder,ALEX GOLDMAN: From Gimlet this is Reply All. I...,,From Gimlet this is Reply All. I’m Alex Gold...
2,5d55ed03a18334a684f84836,/shows/reply-all/6nhgol/144-dark-pattern,PJ VOGT: From Gimlet this is Reply All. I’m PJ...,From Gimlet this is Reply All. I’m PJ Vogt. ...,
3,5d55ed03a18334a684f84837,/shows/reply-all/76hdrj/143-permanent-record,"[THEME MUSIC]PJ VOGT: From Gimlet, this is Rep...","From Gimlet, this is Reply All. I’m PJ Vogt....",And I’m Alex Goldman. Hi Phia. Oh. (chuck...
4,5d55ed03a18334a684f84838,/shows/reply-all/gmhr36/142-we-didnt-start-the...,"[THEME MUSIC]PJ: From Gimlet, this is Reply Al...","From Gimlet, this is Reply All. I’m PJ Vogt....",And I’m Alex Goldman. Welcome once again to...


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 5 columns):
_id           288 non-null object
episodeID     288 non-null object
full_trans    288 non-null object
PJ            288 non-null object
Alex          288 non-null object
dtypes: object(5)
memory usage: 11.3+ KB


## Move to new workbook for NLP

In [72]:
import pickle

In [73]:
with open('df.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)