In [1]:
import requests
import scrapy
import pandas as pd
import re
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.utils.response import open_in_browser
# ***** NOTE: you MUST restart the kernel every time you want to run the spider *****

In [2]:
# Web Crawler Class
class CNBCSpider(scrapy.Spider):
    """Spider that collects the headline and publish timestamp of CNBC articles.

    The crawl starts at ``front_page_url``, follows every link whose href
    contains ``article_url_marker`` and, for each article page that exposes
    both a headline and a publish timestamp, appends them to the module-level
    ``headlines`` and ``dates`` lists. Those lists must exist before the
    crawl is started.

    Both class attributes can be overridden per crawl through Scrapy spider
    arguments, e.g.
    ``process.crawl(CNBCSpider, article_url_marker='cnbc.com/2021/')``.
    """

    name = 'CNBC_spider'

    # Page whose links seed the crawl.
    front_page_url = 'https://www.cnbc.com/'

    # Substring an href must contain to be treated as an article link.
    # The default only matches URLs under the /2020/ path.
    article_url_marker = 'cnbc.com/2020/'

    def start_requests(self):
        """Yield the single request for the front page."""
        yield scrapy.Request(url=self.front_page_url, callback=self.parse_front_page)

    def parse_front_page(self, response):
        """Follow every front-page link whose href contains the article marker."""
        # Named `article_links` rather than `headlines` so it cannot be
        # confused with the module-level `headlines` list used by parse_article.
        article_links = [
            href
            for href in response.css('a::attr(href)').extract()
            if self.article_url_marker in href
        ]

        for link in article_links:
            yield response.follow(url=link, callback=self.parse_article)

    def parse_article(self, response):
        """Record the headline and publish timestamp of one article page."""
        # Both selectors return a list of text nodes (possibly empty).
        headline = response.css('h1.ArticleHeader-headline::text').extract()
        date = response.css('time[data-testid="published-timestamp"]::text').extract()

        # Skip pages missing either field so the two result lists stay aligned.
        if headline and date:
            headlines.append(headline)
            dates.append(date)
     

In [3]:
# Module-level result lists for the web crawler; each one starts out empty.
headlines = []
descriptions = []
dates = []

In [4]:
# Running the Spider
# NOTE(review): per the note in the imports cell, the kernel must be restarted
# before this cell can be run again — presumably because the reactor started by
# process.start() cannot be restarted in the same process; confirm in the Scrapy docs.

# create a crawler process (no settings are passed in)
process = CrawlerProcess()
    
# register the spider class the process should run
process.crawl(CNBCSpider)

# start the crawl; the spider callbacks fill the module-level result lists
process.start()

2020-08-16 21:46:12 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-08-16 21:46:12 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.3 (v3.7.3:ef4ec6ed12, Mar 25 2019, 16:52:21) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Darwin-19.4.0-x86_64-i386-64bit
2020-08-16 21:46:12 [scrapy.crawler] INFO: Overridden settings: {}
2020-08-16 21:46:12 [scrapy.extensions.telnet] INFO: Telnet Password: 066ecd78983a6fdb
2020-08-16 21:46:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-08-16 21:46:12 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.Do

2020-08-16 21:46:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cnbc.com/2020/07/30/personal-finance-101-the-complete-guide-to-managing-your-money.html> (referer: https://www.cnbc.com/)
2020-08-16 21:46:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cnbc.com/2020/08/14/facebook-says-apple-refused-to-waive-30percent-fee-on-new-feature.html> (referer: https://www.cnbc.com/)
2020-08-16 21:46:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cnbc.com/2020/08/14/pinterest-to-announce-new-board-member-as-employees-stage-virtual-walkout.html> (referer: https://www.cnbc.com/)
2020-08-16 21:46:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cnbc.com/2020/08/14/apple-removes-fortnite-from-app-store.html> (referer: https://www.cnbc.com/)
2020-08-16 21:46:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cnbc.com/2020/08/14/uber-lyft-threaten-to-suspend-service-in-california-that-may-backfire.html> (referer: https://www.cnbc.com/)

In [7]:
# Sanity check: the scraped date and headline lists should have the same length.
# Uses the raw `dates` list rather than `parsed_dates`, which is only created in
# a later cell (it is built one-to-one from `dates`, so the length is identical).
len(dates), len(headlines)

(53, 53)

In [6]:
# Parse dates and headlines.
# Each scraped entry is the list returned by `.extract()`; only its first text
# node is used. For dates, keep the text between the first and second comma
# (raises IndexError if the scraped text contains no comma).
parsed_dates = [x[0].split(',')[1].strip() for x in dates]
parsed_headlines = [x[0] for x in headlines]

# Build a DataFrame from the newly scraped data, convert the date strings to
# datetime objects and use them as the index.
# `infer_datetime_format` is omitted on purpose: it is deprecated in pandas 2.x
# and the parsed result is the same without it.
df = (
    pd.DataFrame(data={'publish_date': parsed_dates,
                       'headline': parsed_headlines})
    .assign(publish_date=lambda frame: pd.to_datetime(frame['publish_date']))
    .set_index('publish_date')
)

In [9]:
# Load the previously saved headlines, indexed by their parsed publish date.
df_old = pd.read_csv('cnbc_news.csv', index_col='publish_date', parse_dates=['publish_date'])

# Stack the fresh scrape under the archive, keep the last occurrence of each
# headline (so a newly scraped row wins over an archived one), then order the
# rows by publish date.
combined_df = (
    pd.concat([df_old, df], axis=0)
    .drop_duplicates(subset='headline', keep='last')
    .sort_values(by='publish_date')
)

# Write the merged data back to the same CSV file.
combined_df.to_csv('cnbc_news.csv')

In [10]:
# Reload the merged dataset from cnbc_news.csv and display it as a check.
full_df = pd.read_csv('cnbc_news.csv', index_col='publish_date', parse_dates=['publish_date'])
full_df

Unnamed: 0_level_0,headline
publish_date,Unnamed: 1_level_1
2020-02-25,Here's how to reduce the taxes on your Social ...
2020-03-02,Here's why the coronavirus may clobber your re...
2020-03-02,How to build a cash reserve if coronavirus cau...
2020-03-08,"Dow sinks 2,000 points in worst day since 2008..."
2020-03-08,Here's why you probably won't own a flying car...
2020-03-08,"Sen. Ted Cruz, Rep. Paul Gosar, Rep. Doug Coll..."
2020-03-09,Stock market live Monday: Stocks post major lo...
2020-03-09,It takes an average of 5 days for coronavirus ...
2020-03-09,Stocks making the biggest moves midday: Diamon...
2020-03-09,El-Erian: US stock market could end up droppin...
