In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
YWA_data = []

In [3]:
# only season 3 has legitimate links
# parses transcript links
class YWA(scrapy.Spider):
    name = 'You\'re Wrong About'
    
    def start_requests(self):
        # This isn't the official podcast website, but it is what the official website links to.
        url = 'https://www.buzzsprout.com/1112270'
        yield scrapy.Request(url=url, callback=self.parse_episode)
        
    def parse_episode(self, response):
        episode_links = response.xpath('//a[@class="episode-list--link flex"]/@href').extract()
        for link in episode_links:
            link = 'https://www.buzzsprout.com' + link
            yield response.follow(url=link, 
                               callback =self.parse_transcript) 

    def parse_transcript(self, response):
        current_ep = []
        
        # episode = response.xpath('').extract()
        # current_ep.append(episode)
        
        # title = response.css(' h1.episode__title > text').extract()
        title = response.xpath('//h1[@class="episode__title"]/text()').extract()[0]
        current_ep.append(title)
               
        year = response.xpath('//div[@class="episode__details"]/span/text()').extract()[0]
        current_ep.append(year)
        
        text = response.xpath('//div[contains(@*, "transcript")]/*//text()').extract()
        text = '  '.join(text)
        current_ep.append(text)
        
        YWA_data.append(current_ep)

In [4]:
process = CrawlerProcess()
process.crawl(YWA)
process.start()

2021-05-01 19:36:37 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-05-01 19:36:37 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.19041-SP0
2021-05-01 19:36:37 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-05-01 19:36:37 [scrapy.crawler] INFO: Overridden settings:
{}
2021-05-01 19:36:37 [scrapy.extensions.telnet] INFO: Telnet Password: dd0fc35ade888dc0
2021-05-01 19:36:37 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-05-01 19:36:37 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 's

In [5]:
print(len(YWA_data))
YWA_data[:5]

25


[["Bonus: The President's Physical Fitness Test",
  'October 22, 2020',
  "Sarah:   You're very white too. You're like a little Victorian ghost boy.  Mike:   Welcome to   You're Wrong About   the podcast that occasionally spins off into other podcasts.  Sarah:   \xa0I think people have been describing this as an   Avengers   type situation, which I support because I want to be   Ant Man  .  Mike:   \xa0If I had to pick an Avenger for you, it would absolutely be Ant Man.\xa0  Sarah:   Okay. And so you have created a new podcast called   Maintenance Phase  . And which Avenger are you and what Avenger are you working with?\xa0  Mike:   I think I would probably be Captain Marvel because she's kind of introverted and away from everything else. And then Aubrey, my co-host on Maintenance Phase, would absolutely be Captain America because she's very just pure and like the driven snow.  Sarah:   Yeah. I think those two are appropriate because of my extensive experience of the Avengers, which is

In [6]:
import pandas as pd

YWA_df = pd.DataFrame(YWA_data, 
                  columns=['Title','Year','Text'])
YWA_df

Unnamed: 0,Title,Year,Text
0,Bonus: The President's Physical Fitness Test,"October 22, 2020",Sarah: You're very white too. You're like a ...
1,"Bonus: The ""Twinkie Defense""","November 19, 2020",
2,The Electoral College,"November 16, 2020",
3,The Newsboys' Strike of 1899 (Part 1),"November 23, 2020",Sarah : I feel like there's not enough attent...
4,Princess Diana Part 4: The Divorce,"November 02, 2020",
5,"Bonus: Why Are Dads on ""Terminator 2""","October 29, 2020","Alex: Hello, Sarah Marshall. Sarah: Hell..."
6,Halloween Re-Release: The Exxon Valdez Oil Spill,"October 31, 2020",Mike: I just want someday for there to be a ...
7,Princess Diana Part 5: The Crash,"November 09, 2020",Sarah: They just feel that she has stolen th...
8,The Newsboys' Strike of 1899 (Part 2),"November 30, 2020",Sarah: You're giving me a Princess Diana; I ...
9,Losing Relatives to Fox News,"December 07, 2020",Sarah: There's just something that Americans...


In [7]:
# a bunch of the episodes don't have transcripts - get rid of them
def make_null(text):
    if len(text) == 0:
        text += 'null'
    return(text)

YWA_df['Text'] = YWA_df.Text.map(lambda t: make_null(t))
YWA_df = YWA_df[~YWA_df.Text.str.contains('null')]
YWA_df

Unnamed: 0,Title,Year,Text
0,Bonus: The President's Physical Fitness Test,"October 22, 2020",Sarah: You're very white too. You're like a ...
3,The Newsboys' Strike of 1899 (Part 1),"November 23, 2020",Sarah : I feel like there's not enough attent...
5,"Bonus: Why Are Dads on ""Terminator 2""","October 29, 2020","Alex: Hello, Sarah Marshall. Sarah: Hell..."
6,Halloween Re-Release: The Exxon Valdez Oil Spill,"October 31, 2020",Mike: I just want someday for there to be a ...
7,Princess Diana Part 5: The Crash,"November 09, 2020",Sarah: They just feel that she has stolen th...
8,The Newsboys' Strike of 1899 (Part 2),"November 30, 2020",Sarah: You're giving me a Princess Diana; I ...
9,Losing Relatives to Fox News,"December 07, 2020",Sarah: There's just something that Americans...
11,The Anti-Vaccine Movement,"February 01, 2021","Sarah Marshall 0:00 You know, people compl..."
13,Tipper Gore vs. Heavy Metal: The Case Against ...,"February 08, 2021",Sarah Marshall 0:00 The internet is like a...
14,Tipper Gore vs. Heavy Metal: The Hearing,"February 15, 2021",Sarah: I think that if we were going to have...


In [8]:
import re

YWA_df['Year'] = YWA_df.Year.map(lambda d: re.findall(r'\d{4}', d)[0])
YWA_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  YWA_df['Year'] = YWA_df.Year.map(lambda d: re.findall(r'\d{4}', d)[0])



Unnamed: 0,Title,Year,Text
0,Bonus: The President's Physical Fitness Test,2020,Sarah: You're very white too. You're like a ...
3,The Newsboys' Strike of 1899 (Part 1),2020,Sarah : I feel like there's not enough attent...
5,"Bonus: Why Are Dads on ""Terminator 2""",2020,"Alex: Hello, Sarah Marshall. Sarah: Hell..."
6,Halloween Re-Release: The Exxon Valdez Oil Spill,2020,Mike: I just want someday for there to be a ...
7,Princess Diana Part 5: The Crash,2020,Sarah: They just feel that she has stolen th...


In [9]:
# reorganize
titles = YWA_df.pop('Title')
years = YWA_df.pop('Year')
text = YWA_df.pop('Text')
YWA_df.head()

0
3
5
6
7


In [10]:
YWA_df['Episode'] = ''
YWA_df['Year'] = years
YWA_df['Title'] = titles
YWA_df['Text'] = text

YWA_df.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  YWA_df['Episode'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  YWA_df['Year'] = years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  YWA_df['Title'] = titles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the

Unnamed: 0,Episode,Year,Title,Text
24,,2021,Vanessa Williams Part 1: Becoming Miss America,"Sarah Marshall: Oh, my God. It's like a cat ..."
11,,2021,The Anti-Vaccine Movement,"Sarah Marshall 0:00 You know, people compl..."
8,,2020,The Newsboys' Strike of 1899 (Part 2),Sarah: You're giving me a Princess Diana; I ...
23,,2021,"""Political Correctness""","Also, is free speech a right in the Constituti..."
22,,2021,The O.J. Simpson Trial: From the Mixed-Up File...,Sarah: Economically like it was once necessa...
6,,2020,Halloween Re-Release: The Exxon Valdez Oil Spill,Mike: I just want someday for there to be a ...
3,,2020,The Newsboys' Strike of 1899 (Part 1),Sarah : I feel like there's not enough attent...
13,,2021,Tipper Gore vs. Heavy Metal: The Case Against ...,Sarah Marshall 0:00 The internet is like a...
18,,2021,The O.J. Simpson Trial: The DeLorean Detour,Sarah: Yeah. Instead of MLMs in the early ei...
0,,2020,Bonus: The President's Physical Fitness Test,Sarah: You're very white too. You're like a ...


In [11]:
# Titles are nice and clean, will keep whole date
# get rid of rows without text  
YWA_df = YWA_df[YWA_df.Text != ''].reset_index(drop=True)

YWA_df

Unnamed: 0,Episode,Year,Title,Text
0,,2020,Bonus: The President's Physical Fitness Test,Sarah: You're very white too. You're like a ...
1,,2020,The Newsboys' Strike of 1899 (Part 1),Sarah : I feel like there's not enough attent...
2,,2020,"Bonus: Why Are Dads on ""Terminator 2""","Alex: Hello, Sarah Marshall. Sarah: Hell..."
3,,2020,Halloween Re-Release: The Exxon Valdez Oil Spill,Mike: I just want someday for there to be a ...
4,,2020,Princess Diana Part 5: The Crash,Sarah: They just feel that she has stolen th...
5,,2020,The Newsboys' Strike of 1899 (Part 2),Sarah: You're giving me a Princess Diana; I ...
6,,2020,Losing Relatives to Fox News,Sarah: There's just something that Americans...
7,,2021,The Anti-Vaccine Movement,"Sarah Marshall 0:00 You know, people compl..."
8,,2021,Tipper Gore vs. Heavy Metal: The Case Against ...,Sarah Marshall 0:00 The internet is like a...
9,,2021,Tipper Gore vs. Heavy Metal: The Hearing,Sarah: I think that if we were going to have...


In [12]:
# How does the text look?
YWA_df.Text[2][:1000]

"Alex:   Hello, Sarah Marshall.\xa0  Sarah:   Hello, Alex Steed.\xa0  Alex:   What are we going to talk about today?\xa0  Sarah:   We're going to talk about Terminator 2: Judgment Day.\xa0  Alex:   What is Why Are Dads? about? \xa0  Sarah:   Why Are Dads? is about trying to understand our relationships with our dads and our culture's relationship with what dads are, by watching lots of movies and talking to our friends.  Alex:   Is there anything special by way of guests in this episode?\xa0  Sarah:   There is. We have my You’re Wrong About co-host Michael Hobbes guesting on this episode. It's really cool.\xa0  Alex:   Why did Terminator 2 come up as what we should talk about with Michael Hobbes?  Sarah:   Well it was his suggestion, but it's a movie that we all independently love, and I think have a strong, emotional connection to.  Alex:   Do you think people know that they have strong, emotional connections to Terminator 2, or that they just love it?  Sarah:   I think people in Amer

In [13]:
# get rid of weird \xa0-s and various types of timestamps
import re

def nofiller(text):
	timestamp = r'[ ( ]*\d{,2}:\d{2}[ ) ]*'
	t1 = re.sub(timestamp, '', text)
	t2 = t1.replace(u'\xa0  \xa0', u':')
	t3 = t2.replace(u'\xa0',u'')
	return t3

nofiller(YWA_df.Text[4][:1000])

"Sarah:   They just feel that she has stolen their job and is better at the thing that they're supposed to be best at. And she's not even from here.  Welcome to   You’re Wrong About  , where every story has to come to an end.  Mike:   Oh, you're already putting us in like an elegiac mood for this one.  Sarah:   God, I've been saying that word “elegiac” in my head. And I have genuinely no idea which one of us is right.  Mike:   Well we all know that I am perfect at pronouncing things.  Sarah:   That's true.  Mike:   I think history tells us.  Sarah:   It's probably you, yeah.  Mike:   I am Michael Hobbs, I'm a reporter for the Huffington Post.  Sarah:   I'm Sarah Marshall. I'm working on a book about the Satanic Panic.  Mike:   And if you want to support the show and hear cute bonus episodes, you can go to   patreon.com/yourewrongabout  .  Sarah:   Or perhaps you were cute enough already. You don't need to do any of those things. And we support that also.  Mike:   And today we ar"

In [14]:
YWA_df['Text'] = YWA_df.Text.map(nofiller)
YWA_df.head()

Unnamed: 0,Episode,Year,Title,Text
0,,2020,Bonus: The President's Physical Fitness Test,Sarah: You're very white too. You're like a ...
1,,2020,The Newsboys' Strike of 1899 (Part 1),Sarah : I feel like there's not enough attent...
2,,2020,"Bonus: Why Are Dads on ""Terminator 2""","Alex: Hello, Sarah Marshall. Sarah: Hello..."
3,,2020,Halloween Re-Release: The Exxon Valdez Oil Spill,Mike: I just want someday for there to be a ...
4,,2020,Princess Diana Part 5: The Crash,Sarah: They just feel that she has stolen th...


In [15]:
def speech_finder(text, hosts, host_formatting, last_names):
    host_dict = {}
    for h in hosts:
        speech = ''
        current_host = h + host_formatting
        other_hosts = [(x + host_formatting) for x in hosts if x != h]
        
        lookahead = '|'.join(other_hosts)
        regex = current_host + '.*?(?=' + lookahead + ')'        
        speech = re.findall(regex, text)
        
        speech = [(re.sub(current_host, '', s)).strip() for s in speech]
        host_dict[h] = speech
    
    hosts = '|'.join((hosts + last_names))
    text = re.sub(hosts, 'HOST', text)
    return host_dict, text

In [16]:
YWA_df['Anonymized_text'] = YWA_df.Text.map(lambda t: speech_finder(t, ['Mike','Sarah'], ':   ', ['Hobbes','Marsall'])[1])
YWA_df['Speaker_parts'] = YWA_df.Text.map(lambda t: speech_finder(t, ['Mike','Sarah'], ':   ', ['Hobbes','Marsall'])[0])

In [17]:
YWA_df.Text[0]

"Sarah:   You're very white too. You're like a little Victorian ghost boy.  Mike:   Welcome to   You're Wrong About   the podcast that occasionally spins off into other podcasts.  Sarah:   I think people have been describing this as an   Avengers   type situation, which I support because I want to be   Ant Man  .  Mike:   If I had to pick an Avenger for you, it would absolutely be Ant Man.  Sarah:   Okay. And so you have created a new podcast called   Maintenance Phase  . And which Avenger are you and what Avenger are you working with?  Mike:   I think I would probably be Captain Marvel because she's kind of introverted and away from everything else. And then Aubrey, my co-host on Maintenance Phase, would absolutely be Captain America because she's very just pure and like the driven snow.  Sarah:   Yeah. I think those two are appropriate because of my extensive experience of the Avengers, which is seeing the first movie and the last movie and completely losing interest in the middle, t

In [18]:
%store YWA_df

Stored 'YWA_df' (DataFrame)


In [19]:
YWA_df.Speaker_parts[2]

{'Mike': ["Oh God. Although there's an internet rumor that Alex Steed is actually a voice that I'm doing. I plan to nurture this conspiracy theory as much as possible.  Alex:   Ideally this will break it, but who knows?",
  "Yeah. As with most of Sarah's references, I just try to laugh and move along. That's all I got.",
  'Oh just like me and Alex! Yeah, it is true.',
  "You're in Maine. We both slept with the same ladies.",
  'We need like a little transition music, like, Oh, she did it! We got there!',
  "I'm Michael Hobbes. I'm a reporter for HuffPost and I co-host a podcast called You're Wrong About, with some strange woman I met on the internet.",
  "Oh yeah. I'm forgetting about this because it just launched this week. So I don't even know what I'm supposed to promote anymore. And I also have a new podcast called,   Maintenance Phase  , with another extremely cool lady that I met from the internet. So go check that out.  Alex:   I've learned so much from this podcast just as a l