## Scraper code

In [1]:
from libs.article import Article

import re
import requests

from bs4 import BeautifulSoup
from sqlalchemy.orm import sessionmaker

from libs.sqlcreator import create_alchemy_engine
from libs.multi_thread import multi_thread

In [2]:
import string

from collections import Counter
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

def get_most_common(text, top_n=15):
    """Return the most frequent meaningful words in ``text``.

    Characters listed in ``string.punctuation`` are removed, the text is
    lower-cased and split on whitespace, and words that appear in
    ``ENGLISH_STOP_WORDS`` or are only one character long are discarded.

    Args:
        text: The text to analyse.
        top_n: Maximum number of words to return (defaults to 15).

    Returns:
        A list of at most ``top_n`` words, most frequent first.
    """
    exclude = set(string.punctuation)
    text_nopunct = ''.join(ch for ch in text if ch not in exclude)

    # split() with no argument breaks on any run of whitespace. Splitting on a
    # single space left words on either side of a newline or tab fused into
    # one token (e.g. "end\n\nstart"), which distorted the counts.
    words = text_nopunct.lower().split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS and len(word) > 1]

    return [word for word, _ in Counter(words).most_common(top_n)]


# Read a locally saved article and run the keyword extraction on it.
with open("npr_article_572945894.txt") as file_hdl:
    article_text = file_hdl.read()

# Everything before the first newline of the file is used as the title.
title = article_text.partition("\n")[0]
url = "npr.org/testing"

most_common = get_most_common(article_text)


In [3]:
def get_text(soup):
    """Concatenate the text of the plain paragraphs of a parsed page.

    A ``<p>`` tag is used only when it has no ``class`` attribute and
    ``findChildren('b')`` finds nothing inside it. The text of every kept
    paragraph is followed by a blank line ("\\n\\n").
    """
    def is_plain(tag):
        # Same short-circuit order as the two separate checks: the <b> lookup
        # only runs for tags that have no class attribute.
        return not (tag.has_attr('class') or tag.findChildren('b'))

    return "".join(
        tag.get_text() + "\n\n"
        for tag in soup.find_all('p')
        if is_plain(tag)
    )

In [4]:
def get_npr_urls(soup):
    """Return the NPR article links found on a parsed page.

    A link is kept when its ``href`` starts with ``https://www.npr.org/`` and
    contains a ``20YY/M/D/<digits>/`` segment. Links are returned in the order
    ``find_all`` yields them; duplicates are not removed.
    """
    article_path = re.compile(r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/')

    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
    return [
        href
        for href in hrefs
        if href.startswith('https://www.npr.org/') and article_path.search(href)
    ]

In [5]:
def get_title(soup):
    """Return the article title from a parsed page.

    The title is the text of the first ``<b>`` element returned by
    ``findChildren('b')`` for an ``<a>`` tag whose first class is ``title``.
    When several such links exist, the last one on the page wins.

    Returns:
        The title text, or "" when no usable title link is found.
    """
    title = ""
    for link in soup.find_all('a'):
        if not link.has_attr('class'):
            continue

        classes = link['class']
        # Guard the [0] lookup: an empty class list would raise IndexError.
        if not classes or classes[0] != "title":
            continue

        bold_children = link.findChildren('b')
        # A title link without a <b> child used to raise IndexError; skip it
        # so one odd page cannot abort the scrape.
        if bold_children:
            title = bold_children[0].get_text()

    return title

In [6]:
def get_details(url):
    """Extract the publication date and article id from an NPR article URL.

    Args:
        url: A URL containing a ``20YY/M/D/<digits>/`` segment.

    Returns:
        A ``(date, article_id)`` tuple of strings. ``date`` is the year, month
        and day joined with "-" exactly as they are written in the URL;
        ``article_id`` is the run of digits that follows them.

    Raises:
        ValueError: If ``url`` does not contain the expected segment.
    """
    pattern = r'20[\d]{2}/[\d]{1,2}/[\d]{1,2}/[\d]+/'

    match = re.search(pattern, url)
    if match is None:
        # Without this check the code below failed with an opaque
        # "'NoneType' object is not subscriptable" TypeError.
        raise ValueError("URL has no YYYY/MM/DD/<id>/ segment: %r" % (url,))

    # The matched text ends with "/", so the split has a trailing empty item.
    year, month, day, article_id = match.group(0).split("/")[:4]

    return "-".join((year, month, day)), article_id

In [7]:
def scrape_url(url, timeout=30):
    """Download a page and extract its NPR links and, if present, its article.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout a stalled connection can block the calling
            thread indefinitely.

    Returns:
        A dict with the key "urls" (the list returned by ``get_npr_urls``)
        and, only when more than 100 characters of text were extracted, the
        key "article" holding an ``Article`` built from the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The links are extracted once; the page was previously scanned twice and
    # the second result thrown away.
    return_dict = {"urls": get_npr_urls(soup)}

    text = get_text(soup)
    # Pages that yield 100 characters of text or fewer are not stored as articles.
    if len(text) > 100:
        title = get_title(soup)
        # Only the date is needed here; the article id is discarded.
        # get_details raises if the URL has no date/id segment.
        date, _ = get_details(url)

        return_dict["article"] = Article(title, text, url, get_most_common(text), date)

    return return_dict

In [8]:
def is_url_in_db(session, url):
    """Tell whether at least one ``Article`` row has the given ``url``."""
    matching_rows = session.query(Article).filter(Article.url == url)
    return matching_rows.count() > 0

In [9]:
# Try the scraper on one known article and display the extracted text.
article_dict = scrape_url(
    "https://www.npr.org/2018/05/04/608323118/in-wake-of-school-shooting-trump-pence-to-address-nra"
)
article_dict["article"].text

'Click here if you don\'t see the video player in this story.\n\nUpdated at 5:24 p.m. ET\n\nPresident Trump and Vice President Pence spoke to the National Rifle Association at the organization\'s annual meeting in Dallas on Friday — renewing a political partnership that was briefly tested by the deadly school shooting in Parkland, Fla.\n\nTrump said he had been warned by political advisers that appearing before the NRA might not be popular in the current climate. But he brushed those cautions aside.\n\n"You know what I said? \'Bye-bye. Got to get on the plane,\' " Trump told the cheering crowd. "Because we have to do the right thing"\n\nIt\'s the fourth year in a row Trump has addressed the gun rights organization, which was a strong backer of his 2016 campaign. Last year was the first time a sitting president had addressed the group since Ronald Reagan did so in 1983.\n\n"The people in this hall have never taken our freedom for granted," Trump said. "Thanks to your activism and dedica

In [10]:
# Crawl state: the address the crawl starts from.
start_url = "https://www.npr.org/"

# URLs already handled, and the queue of URLs still to visit (seeded with
# the start address).
traversed_urls = set()
to_traverse = {start_url}

In [11]:
# Build the engine with the project's create_alchemy_engine helper and open
# one session bound to it; `session` is used by the cells that follow.
engine = create_alchemy_engine()

Session = sessionmaker(bind=engine)
session = Session()

In [12]:
# Seed the traversed set with every URL already stored in the database.
# Querying a single column yields one-element row tuples rather than bare
# strings, so each row is unpacked; storing the rows themselves would never
# match the string URLs that are later subtracted from this set.
for (url,) in session.query(Article.url).distinct():
    traversed_urls.add(url)
len(traversed_urls)

0

In [19]:
# Crawl in batches until the queue is empty or the database holds 40000 articles.
count = session.query(Article).count()

while len(to_traverse) > 0 and count < 40000:

    print("DB count:", count, "Traversed:", len(traversed_urls), "Queue:", len(to_traverse))
    ## Create a list of URLS to traverse (at most 100 per batch)
    curr_traverse = []
    while len(curr_traverse) < 100 and len(to_traverse) > 0:
        url = to_traverse.pop()
        if is_url_in_db(session, url):
            # Record the URL as handled; otherwise every page that links to
            # it puts it back on the queue and it is looked up again.
            traversed_urls.add(url)
            continue
        curr_traverse.append(url)

    ## Pass that list to multi-threading
    results = multi_thread(scrape_url, curr_traverse, 10)

    ## Mark the batch as traversed before queueing newly discovered URLs,
    ## so pages in this batch that link to each other are not queued again.
    traversed_urls.update(curr_traverse)

    ## TODO: create class for DB that maintains unique set of to_traverse and traverse
    for result in results:
        # NOTE(review): presumably each result is a pair whose second item is
        # the dict returned by scrape_url -- confirm against multi_thread.
        result_dict = result[1]
        if "article" in result_dict:
            session.add(result_dict["article"])

        # Grow the queue in place instead of rebuilding the whole set for
        # every result.
        to_traverse.update(set(result_dict["urls"]) - traversed_urls)

    # Commit first so the count used for the stop condition reflects what
    # has actually been persisted.
    session.commit()
    count = session.query(Article).count()
              

DB count: 30004 Traversed: 33904 Queue: 21970
DB count: 30089 Traversed: 34004 Queue: 21997
DB count: 30177 Traversed: 34104 Queue: 21968
DB count: 30264 Traversed: 34204 Queue: 22004
DB count: 30350 Traversed: 34304 Queue: 22050
DB count: 30442 Traversed: 34404 Queue: 22093
DB count: 30530 Traversed: 34504 Queue: 22107
DB count: 30620 Traversed: 34604 Queue: 22118
DB count: 30709 Traversed: 34704 Queue: 22079
DB count: 30798 Traversed: 34804 Queue: 22045
DB count: 30887 Traversed: 34904 Queue: 22094
DB count: 30975 Traversed: 35004 Queue: 22086
DB count: 31061 Traversed: 35104 Queue: 22102
DB count: 31147 Traversed: 35204 Queue: 22166
DB count: 31234 Traversed: 35304 Queue: 22197
DB count: 31315 Traversed: 35404 Queue: 22256
DB count: 31400 Traversed: 35504 Queue: 22273
DB count: 31486 Traversed: 35604 Queue: 22359
DB count: 31579 Traversed: 35704 Queue: 22363
DB count: 31666 Traversed: 35804 Queue: 22442
DB count: 31751 Traversed: 35904 Queue: 22536
DB count: 31841 Traversed: 36004 Q

In [24]:
import pickle

# Save both URL sets to disk with pickle.
for pickle_path, url_set in (('traversed_set.pkl', traversed_urls),
                             ('to_traverse.pkl', to_traverse)):
    with open(pickle_path, 'wb') as file_hdl:
        pickle.dump(url_set, file_hdl)

In [14]:
# Number of Article rows currently in the database.
rows = session.query(Article).count()
rows

10060

In [17]:
# Number of URLs still waiting in the crawl queue.
len(to_traverse)

10341

In [16]:
from sqlalchemy import desc

#for instance in session.query(Article).order_by(desc(Article.date)):
#    print(instance.title, instance.url)

# Print title and URL for ten articles ordered by date, newest first,
# skipping the first ten rows of that ordering.
second_page = (
    session.query(Article)
    .order_by(desc(Article.date))
    .limit(10)
    .offset(10)
)
for instance in second_page:
    print(instance.title, instance.url)

Campaign Finance Law And The Stormy Daniels Scandal https://www.npr.org/2018/05/05/608723641/campaign-finance-law-and-the-stormy-daniels-scandal
Mueller Probe Update https://www.npr.org/2018/05/05/608723571/mueller-probe-update
A New Church Will Rise After A Shooting Shakes The Community https://www.npr.org/2018/05/05/608723627/a-new-church-will-rise-after-a-shooting-shakes-the-community
Exploring 'The Heritage' Of Black Athletes https://www.npr.org/2018/05/05/608723592/exploring-the-heritage-of-black-athletes
Saturday Sports: NBA Semifinals, Kentucky Derby https://www.npr.org/2018/05/05/608723655/saturday-sports-nba-semifinals-kentucky-derby
Gun Control Activists Protest NRA In Dallas https://www.npr.org/2018/05/05/608723620/gun-control-activists-protest-nra-in-dallas
Zora Neale Hurston's 'Barracoon' Gets Published, More Than 60 Years Later https://www.npr.org/2018/05/05/608723606/zora-neale-hurstons-barracoon-gets-published-more-than-60-years-later
Trump Administration Talks Trade Wi