In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re

In [2]:
radiolab_data = []

In [3]:
# only season 3 has legitimate links
# parses transcript links
class radiolab(scrapy.Spider):
    name = 'radiolab'
    
    def start_requests(self):
        base_url = 'https://www.wnycstudios.org/podcasts/radiolab/podcasts/'
        for i in range(1, 2):
            url = base_url + str(i)
            yield scrapy.Request(url=url, callback=self.parse_episode)
        
    def parse_episode(self, response):
        episode_links = response.xpath('//h1[contains(@class,episode-tease__title)]//a/@href').extract()
        for link in episode_links:
            link = 'https://www.wnycstudios.org' + link
            # print(link)
            yield response.follow(url=link, 
                               callback =self.parse_transcript) 
    

    def parse_transcript(self, response):
        current_ep = []
        
        title = response.xpath('//h3[@class="story__title"]/text()').extract()
        current_ep.append(title)
        
        year = response.xpath('//p[@class="story-metadata__date"]/text()').extract()
        current_ep.append(year)
        
        text = response.xpath('//div[@class="text"]//text()').extract()
        current_ep.append(text)
        
        radiolab_data.append(current_ep)

In [4]:
process = CrawlerProcess()
process.crawl(radiolab)
process.start()

2021-03-12 16:14:16 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-03-12 16:14:16 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.19041-SP0
2021-03-12 16:14:16 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-03-12 16:14:16 [scrapy.crawler] INFO: Overridden settings:
{}
2021-03-12 16:14:16 [scrapy.extensions.telnet] INFO: Telnet Password: 5faf30c337bc1c13
2021-03-12 16:14:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-03-12 16:14:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 's

In [5]:
radiolab_data[2]

[['\n    Sight Unseen\n  '],
 ['\n    January 13, 2021\n  '],
 ['\n          ',
  'UNIDENTIFIED PERSON #1: Listener-supported WNYC Studios.',
  '\r\n',
  '(SOUNDBITE OF RADIOLAB INTRO)',
  '\r\n',
  "UNIDENTIFIED PERSON #2: Wait. Wait. You're listening (laughter)...",
  '\r\n',
  'UNIDENTIFIED PERSON #3: OK.',
  '\r\n',
  'UNIDENTIFIED PERSON #2: All right.',
  '\r\n',
  'UNIDENTIFIED PERSON #3: OK.',
  '\r\n',
  'UNIDENTIFIED PERSON #2: All right.',
  '\r\n',
  "UNIDENTIFIED PERSON #3: You're listening...",
  '\r\n',
  'UNIDENTIFIED PERSON #2: Listening...',
  '\r\n',
  'UNIDENTIFIED PERSON #3: ...To RADIOLAB.',
  '\r\n',
  'UNIDENTIFIED PERSON #4: RADIOLAB.',
  '\r\n',
  'UNIDENTIFIED PEOPLE: From WNYC.',
  '\r\n',
  'UNIDENTIFIED PERSON #3: C?',
  '\r\n',
  'UNIDENTIFIED PERSON #2: Yep.',
  '\r\n',
  "LULU MILLER: I'm Lulu Miller. This is RADIOLAB.",
  '\r\n',
  '(SOUNDBITE OF TYPING)',
  '\r\n',
  "LULU MILLER: And today we're going to start with...",
  '\r\n',
  'KAINAZ AMARIA: Lu

In [6]:
import pandas as pd

radiolab_df = pd.DataFrame(radiolab_data,
                         columns = ['Title','Year','Text'])
radiolab_df

Unnamed: 0,Title,Year,Text
0,[\n A Note from Radiolab\n ],"[\n January 7, 2021\n ]",[]
1,[\n Dispatch 14: Covid Crystal Ball\n ],"[\n March 12, 2021\n ]",[]
2,[\n Sight Unseen\n ],"[\n January 13, 2021\n ]","[\n , UNIDENTIFIED PERSON #1: Listene..."
3,[\n More Money Less Problems\n ],"[\n January 15, 2021\n ]","[\n , ANNOUNCER: Listener-supported W..."
4,[\n Smile My Ass\n ],"[\n January 29, 2021\n ]","[\n , Jad:, \r\n, Wait, you're listen..."
5,[\n Post Reports: Four Hours of Insurrectio...,"[\n January 16, 2021\n ]","[\n , Speaker: , Wait, you're listeni..."
6,[\n Red Herring\n ],"[\n February 19, 2021\n ]","[\n , UNIDENTIFIED PERSON #1: Listene..."
7,[\n Facebook's Supreme Court\n ],"[\n February 12, 2021\n ]","[\n , ANNOUNCER: Listener-supported W..."
8,[\n The Ceremony \n ],"[\n February 25, 2021\n ]","[\n , Intro 1:Intro 1:, \r\n, Wait, w..."


In [7]:
# clean title
radiolab_df['Title'] = radiolab_df.Title.map(lambda t: t[0].split('\n')[1])
radiolab_df.Title[:5]

0                   A Note from Radiolab
1        Dispatch 14: Covid Crystal Ball
2                           Sight Unseen
3               More Money Less Problems
4                           Smile My Ass
Name: Title, dtype: object

In [8]:
# clean year
radiolab_df['Year'] = radiolab_df.Year.map(lambda d: d[0].split('\n')[1][-4:])
radiolab_df.Year[-10:]

0    2021
1    2021
2    2021
3    2021
4    2021
5    2021
6    2021
7    2021
8    2021
Name: Year, dtype: object

In [9]:
# clean text
radiolab_df.Text[2]

['\n          ',
 'UNIDENTIFIED PERSON #1: Listener-supported WNYC Studios.',
 '\r\n',
 '(SOUNDBITE OF RADIOLAB INTRO)',
 '\r\n',
 "UNIDENTIFIED PERSON #2: Wait. Wait. You're listening (laughter)...",
 '\r\n',
 'UNIDENTIFIED PERSON #3: OK.',
 '\r\n',
 'UNIDENTIFIED PERSON #2: All right.',
 '\r\n',
 'UNIDENTIFIED PERSON #3: OK.',
 '\r\n',
 'UNIDENTIFIED PERSON #2: All right.',
 '\r\n',
 "UNIDENTIFIED PERSON #3: You're listening...",
 '\r\n',
 'UNIDENTIFIED PERSON #2: Listening...',
 '\r\n',
 'UNIDENTIFIED PERSON #3: ...To RADIOLAB.',
 '\r\n',
 'UNIDENTIFIED PERSON #4: RADIOLAB.',
 '\r\n',
 'UNIDENTIFIED PEOPLE: From WNYC.',
 '\r\n',
 'UNIDENTIFIED PERSON #3: C?',
 '\r\n',
 'UNIDENTIFIED PERSON #2: Yep.',
 '\r\n',
 "LULU MILLER: I'm Lulu Miller. This is RADIOLAB.",
 '\r\n',
 '(SOUNDBITE OF TYPING)',
 '\r\n',
 "LULU MILLER: And today we're going to start with...",
 '\r\n',
 'KAINAZ AMARIA: Lulu?',
 '\r\n',
 "LULU MILLER: I'm just recording your furious typing.",
 '\r\n',
 '(LAUGHTER)',
 '

In [10]:
# What is this supposed to be??????
import re

def cleanstring(text):
    t1= ' '.join(text)
    new_text = re.sub(r'\\[rn(xa0)]*', '', t1) # This doesn't work . . . just have to use for-loop
    return new_text


# mystring = 'There are just so '
radiolab_df['Text'] = radiolab_df.Text.map(lambda t: cleanstring(t))
radiolab_df.Text[2]

'\n           UNIDENTIFIED PERSON #1: Listener-supported WNYC Studios. \r\n (SOUNDBITE OF RADIOLAB INTRO) \r\n UNIDENTIFIED PERSON #2: Wait. Wait. You\'re listening (laughter)... \r\n UNIDENTIFIED PERSON #3: OK. \r\n UNIDENTIFIED PERSON #2: All right. \r\n UNIDENTIFIED PERSON #3: OK. \r\n UNIDENTIFIED PERSON #2: All right. \r\n UNIDENTIFIED PERSON #3: You\'re listening... \r\n UNIDENTIFIED PERSON #2: Listening... \r\n UNIDENTIFIED PERSON #3: ...To RADIOLAB. \r\n UNIDENTIFIED PERSON #4: RADIOLAB. \r\n UNIDENTIFIED PEOPLE: From WNYC. \r\n UNIDENTIFIED PERSON #3: C? \r\n UNIDENTIFIED PERSON #2: Yep. \r\n LULU MILLER: I\'m Lulu Miller. This is RADIOLAB. \r\n (SOUNDBITE OF TYPING) \r\n LULU MILLER: And today we\'re going to start with... \r\n KAINAZ AMARIA: Lulu? \r\n LULU MILLER: I\'m just recording your furious typing. \r\n (LAUGHTER) \r\n LULU MILLER: ...A very busy photo editor. \r\n KAINAZ AMARIA: My name is Kainaz Amaria, and I am the visuals editor at vox.com. \r\n LULU MILLER: Are y

In [15]:
re.findall(r'\\[rn(xa0)]*', radiolab_df.Text[2])

[]