In [1]:
import os
import re

import pandas as pd
import requests
import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
# ***** NOTE: you MUST restart the kernel every time you want to run the spider *****

In [2]:
# Web Crawler Class
class WSJSpider(scrapy.Spider):
    """Spider that signs in on the WSJ home page and scrapes headline articles.

    For every article it follows, the spider appends one entry to each of the
    module-level lists ``headlines``, ``descriptions``, ``article_dates`` and
    ``article_authors``; those lists must exist before the crawl starts.

    The account credentials are read from the ``WSJ_USERNAME`` and
    ``WSJ_PASSWORD`` environment variables so that they are never stored in
    the notebook.
    """
    name = 'WSJ_spider'

    def start_requests(self):
        """Start the crawl by requesting the WSJ home page."""
        url = 'https://www.wsj.com/'
        yield scrapy.Request(url=url, callback=self.login)

    def login(self, response):
        """Fill in and submit a form from the home page with the credentials.

        Raises:
            RuntimeError: if WSJ_USERNAME or WSJ_PASSWORD is not set.
        """
        username = os.environ.get('WSJ_USERNAME')
        password = os.environ.get('WSJ_PASSWORD')
        if not username or not password:
            raise RuntimeError(
                'Set the WSJ_USERNAME and WSJ_PASSWORD environment variables '
                'before running the spider.')

        # NOTE(review): the recorded crawl log shows these fields being sent as
        # a GET query string on https://www.wsj.com/, which suggests the form
        # picked here may not be the real sign-in form -- verify.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': username, 'password': password},
            callback=self.after_login)

    def after_login(self, response):
        """Follow every headline link on the page whose URL contains 'wsj'."""
        headlines_links = response.css('h3.WSJTheme--headline--19_2KfxG > a::attr(href)').getall()

        for link in headlines_links:
            if 'wsj' in link:
                yield response.follow(url=link, callback=self.parse_article)

    def parse_article(self, response):
        """Extract headline, description, date and authors from one article.

        Articles on which neither headline selector matches are skipped, so
        the four result lists always stay the same length.
        """
        # get article headline, falling back to the alternate headline selector
        headline = response.css('h1.wsj-article-headline::text').get()
        if headline is None:
            headline = response.css('h1.bigTop__hed::text').get()
        if headline is None:
            self.logger.warning('No headline found on %s; skipping article', response.url)
            return

        # get article description (None when the page has no sub-head)
        description = response.css('h2.sub-head::text').get()

        # get article date (None when the page has no timestamp)
        date = response.css('time.article__timestamp::text').get()
        if date is not None:
            date = date.strip()

        # get article authors (possibly an empty list)
        authors = response.css('ul.author-info > div.info-name::text').getall()

        # append everything together so the parallel lists stay aligned
        headlines.append(headline.strip())
        descriptions.append(description)
        article_dates.append(date)
        article_authors.append(authors)

In [3]:
# initialise the lists that the web crawler appends its results to
# (one list per scraped field; entries at the same position belong to the same article)
headlines, descriptions, article_authors, article_dates = [], [], [], []

In [4]:
# Running the Spider
# NOTE: per the note in the imports cell, the kernel must be restarted before
# this cell can be run again.

# create a crawler process; no settings are passed in, so Scrapy's defaults apply
process = CrawlerProcess()
    
# register the WSJSpider class as the spider this process will run
process.crawl(WSJSpider)

# start the crawl
# NOTE(review): this call appears to block until the spider has finished (the
# captured log ends with "Spider closed (finished)") -- confirm.
process.start()

2020-02-22 15:49:03 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-02-22 15:49:03 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.3 (v3.7.3:ef4ec6ed12, Mar 25 2019, 16:52:21) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.2.0-x86_64-i386-64bit
2020-02-22 15:49:03 [scrapy.crawler] INFO: Overridden settings: {}
2020-02-22 15:49:03 [scrapy.extensions.telnet] INFO: Telnet Password: b92eb9cf74cdefe1
2020-02-22 15:49:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-02-22 15:49:03 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.Do

2020-02-22 15:49:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.wsj.com/articles/weinstein-jury-deadlocked-on-predatory-sexual-assault-11582315715?mod=hp_listb_pos2> (referer: https://www.wsj.com/?username=michaelblau%40wustl.edu&password=Magic1998)
2020-02-22 15:49:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.wsj.com/articles/maersk-warns-of-coronavirus-blow-as-profits-slump-11582185087?mod=hp_minor_pos15> (referer: https://www.wsj.com/?username=michaelblau%40wustl.edu&password=Magic1998)
2020-02-22 15:49:09 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://blogs.wsj.com/cio/2020/02/21/can-democracy-and-free-markets-survive-in-the-coming-age-of-ai/?guid=BL-CIOB-14876&mod=hp_minor_pos4&dsk=y> from <GET https://blogs.wsj.com/articles.php?guid=BL-CIOB-14876&mod=hp_minor_pos4&dsk=y>
2020-02-22 15:49:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.wsj.com/articles/david-ruder-weathered-black-monday-as-sec-chairman

2020-02-22 15:49:14 [scrapy.core.engine] INFO: Spider closed (finished)


In [5]:
# Matches "<month>[.] <day>, <year>" where the month is abbreviated or spelled
# out, e.g. "Feb. 22, 2020", "Sept. 3, 2019", "May 5, 2020", "March 12, 2020".
# Only the first three letters of the month are captured.
_WSJ_DATE_PATTERN = re.compile(
    r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.? (\d{1,2}), (\d{4})',
    re.IGNORECASE)


def convertToDatetime(s):
    """
    Pull the publication date out of the raw timestamp text of a WSJ article.

    Input
        s: raw date text from a WSJ article (anything that is not a string
           is treated as "no date")
    Output
        String of the form 'Mon/D/YYYY' (e.g. 'Feb/22/2020'), ready to be
        handed to pd.to_datetime, or None when no date is found.
        Despite the function's name, the result is a string, not a datetime.
    """
    if not isinstance(s, str):
        return None

    match = _WSJ_DATE_PATTERN.search(s)
    if match is None:
        return None

    # groups are (three-letter month, day, year)
    return '/'.join(match.groups())

In [6]:
# Assemble the freshly scraped fields into a frame, reduce the raw date text
# to a parseable date string, convert it to datetimes, and use the result as
# the index.
df = (
    pd.DataFrame(data={'publish_date': article_dates,
                       'headline': headlines,
                       'description': descriptions})
    .assign(publish_date=lambda frame: pd.to_datetime(
        frame['publish_date'].apply(convertToDatetime),
        infer_datetime_format=True))
    .set_index('publish_date')
)

In [7]:
# load the articles saved by earlier runs
df_old = pd.read_csv('wsj_news.csv', parse_dates=['publish_date'], index_col='publish_date')

# stack the new rows under the old ones, keep only the first occurrence of
# each headline, and order the result chronologically
combined_df = (
    pd.concat([df_old, df], axis=0)
    .drop_duplicates(subset="headline", keep='first')
    .sort_values(by='publish_date')
)

# write the merged data back over the original .CSV
combined_df.to_csv('wsj_news.csv')

In [9]:
# reload the merged dataset from wsj_news.csv to check what was written
full_df = pd.read_csv(
    'wsj_news.csv',
    index_col='publish_date',
    parse_dates=['publish_date'],
)
full_df

Unnamed: 0_level_0,headline,description
publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-02-12,Omnicom’s Hearts & Science Expands Marketing T...,The agency projects the group will account for...
2020-02-13,"Instagram Targets More Funding, Ad Revenue-Sha...",The Facebook-owned company looks to pay more f...
2020-02-17,You’re Getting Married Again. Should You Combi...,Many experts say it’s OK to keep things separa...
2020-02-17,"People Don’t Save Enough for Emergencies, but ...",Small changes to company retirement plans can ...
2020-02-19,David Ruder Weathered Black Monday as SEC Chai...,Former Northwestern University law dean sought...
2020-02-19,U.S. House Subcommittee Scrutinizes Accounting...,Rep. Sherman looks to make accounting rules a ...
2020-02-20,Maersk Warns of Lower Earnings From Coronaviru...,The world’s biggest container ship operator sa...
2020-02-20,More Manufacturers Bet on Simulation Software,Tool allows auto makers and others to assess h...
2020-02-20,Swiss Regulator Censures Julius Baer in FIFA M...,The Swiss bank failed to do enough to identify...
2020-02-20,New Mexico Sues Google Over Children’s Data Pr...,State alleges tech company’s education platfor...
