In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re

In [2]:
Lore_data = []

In [3]:
# parses transcript links
class Lore(scrapy.Spider):
    name = 'Lore'
    
    def start_requests(self):
        url = 'https://loretranscripts.tumblr.com/page/2'
        # print(urls) # Urls are correct
        yield scrapy.Request(url=url, callback=self.parse_episode)
        
    def parse_episode(self, response):
        transcript_links = response.css(' div.entry > ul >li > a::attr(href)').extract() 
        # print([l for l in transcript_links])
        for link in transcript_links:
            yield response.follow(url=link, 
                                  callback =self.parse_transcript)
    
    def parse_transcript(self, response):  
        current_ep = []
        # Title - done, clean
        title = response.css(' div.entry > h1::text').extract()[0].split('Lore Episode ')[1]
        
        ep_number = re.findall(r'\d+', title)[0] # Not ideal re, but works
        while len(ep_number) < 3:
            ep_number = '0' + ep_number
        current_ep.append(ep_number)
        
        ep_year = re.findall(r'\d+', title)[-1] # Not ideal re, but it works
        current_ep.append(ep_year)
        
        ep_title = re.findall(r'[^\d*:].+?(?=\(Transcript\))', title)
        current_ep.append(ep_title)
        
        
        text = response.css(' div.entry > p').extract()
        current_ep.append(text)
        
        Lore_data.append(current_ep)
        

In [4]:
process = CrawlerProcess()
process.crawl(Lore)
process.start()

2021-03-22 21:48:33 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-03-22 21:48:33 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.19041-SP0
2021-03-22 21:48:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-03-22 21:48:33 [scrapy.crawler] INFO: Overridden settings:
{}
2021-03-22 21:48:33 [scrapy.extensions.telnet] INFO: Telnet Password: 151f51f7b1d2a905
2021-03-22 21:48:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-03-22 21:48:33 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 's

In [5]:
import pandas as pd

Lore_df = pd.DataFrame(Lore_data,
                       columns =['Episode', 'Year', 'Title', 'Text'])

In [6]:
Lore_df

Unnamed: 0,Episode,Year,Title,Text
0,130,2019,[ In Plain Sight ],"[<p>tw: none<br></p>, <p>Disclaimer: This tran..."
1,24,2015,[ A Stranger Among Us ],"[<p>tw: death, gore, death of children, diseas..."
2,14,2015,[ The Others ],"[<p><b>tw: </b>death of children, childhood il..."
3,30,2016,[ Deep and Twisted Roots ],"[<p>tw: blood<br></p>, <p>Disclaimer: This tra..."
4,9,2015,[ The Devil on the Roof ],"[<p><b>tw: </b>animal death</p>, <p><b>Disclai..."
5,7,2015,[ In the Woods ],"[<p><b>tw: </b>ghosts, suicide, racism (coloni..."
6,19,2015,[ Bite Marks ],"[<p>tw: death, graveyards, corpses, details of..."
7,32,2016,[ Tampered ],[<p>Disclaimer: This transcript is entirely no...
8,26,2016,[ Brought Back ],"[<p>tw: racism, colonialism, live burial, slav..."
9,5,2015,[ Under Construction ],[<p><b>tw: </b>nothing I can think of! This on...


In [7]:
Lore_df = Lore_df.sort_values(by=['Episode'], ascending=True)
Lore_df.head()

Unnamed: 0,Episode,Year,Title,Text
13,1,2015,[ They Made a Tonic ],"[<p><b>tw: </b>horror, bodily mutilation, bloo..."
10,3,2015,[ The Beast Within ],"[<p><b>tw: </b>murder, rape, death of children..."
9,5,2015,[ Under Construction ],[<p><b>tw: </b>nothing I can think of! This on...
5,7,2015,[ In the Woods ],"[<p><b>tw: </b>ghosts, suicide, racism (coloni..."
4,9,2015,[ The Devil on the Roof ],"[<p><b>tw: </b>animal death</p>, <p><b>Disclai..."


In [8]:
Lore_df.Text[0]

['<p>tw: none<br></p>',
 '<p>Disclaimer: This transcript is entirely non-profit and fan-made. All credit for this content goes to Aaron Mahnke, creator of Lore podcast. It is by a fan, for fans, and meant to make the content of the podcast more accessible to all. Also, there may be mistakes, despite rigorous re-reading on my part. Feel free to point them out, but please be nice!</p>',
 '<p>In early\nwinter of 1822, Captain Samuel Barrett Edes became a hero. He was sailing in\nthe south-east Pacific when he and his crew encountered a Dutch ship that was\nin trouble. Edes managed to save every single one of the Dutch soldiers, and\nthen headed for the city of Batavia, known today as Jakarta, to drop them off\nand see if a reward could be collected. While he waited, he did some shopping.\nNow, Edes wasn’t rich by any stretch of the imagination, but he owned a small\nportion of the ship he sailed and of course, he was expecting a handsome reward\nfor his heroic efforts. With this in mind, 

In [9]:
# First two and last two paragraphs aren't from podcast.  They were written by the transcriber.  Get rid of 'em!
Lore_df['Text'] = Lore_df.Text.map(lambda t: '  '.join(t[2:-2]))
Lore_df.Text[0]



In [10]:
# Use regex to get rid of the html element tags
import re

Lore_df['Text'] = Lore_df.Text.map(lambda t: re.sub(r'<\S+>', '', t))
Lore_df.Text[0]



One last weird thing with the title, then ready for processing!

In [11]:
type(Lore_df.Title[0])

list

In [12]:
Lore_df['Title'] = Lore_df.Title.map(lambda t: t[0])

In [13]:
Lore_df.head()

Unnamed: 0,Episode,Year,Title,Text
13,1,2015,They Made a Tonic,"Hollywood is… obsessed. Sure, we often think o..."
10,3,2015,The Beast Within,Ask anyone in the mental health profession abo...
9,5,2015,Under Construction,"On the south-west corner of Iceland, just to t..."
5,7,2015,In the Woods,Nothing can be as isolating or confining as th...
4,9,2015,The Devil on the Roof,"In March of 2014, a hiker in Lithuania stumble..."


In [14]:
%store Lore_df

Stored 'Lore_df' (DataFrame)
