# Data fetching
This module fetches article pages from newspaper websites, extracts their text, and assembles the extracted articles into a corpus.

In [None]:
import main

In [None]:
import os
from subprocess import call, check_output
import glob
import re

from bs4 import BeautifulSoup

from log import writeLog
from timer import timer_start, timer_stop

In [None]:
def page_to_file(page, newspaper, name):
    """Save an html page as ../data/webpages/<newspaper>/<name>.html.

    Args:
        page: Text of the html page to write.
        newspaper: Name of the sub-directory the page is stored in; it is
            created when missing.
        name: File name, without the ".html" extension.
    """
    folder = os.path.join("..", "data", "webpages", newspaper)
    # exist_ok replaces the separate isdir() test, which could race with
    # another process creating the same directory.
    os.makedirs(folder, exist_ok=True)
    # The context manager closes the file even when write() raises.
    with open(os.path.join(folder, name + ".html"), "w") as f:
        f.write(page)
    writeLog("debug", "webpages/{}/{}.html saved!".format(newspaper, name))

In [None]:
def article_to_file(page, newspaper, name):
    """Save an extracted article as ../data/articles/<newspaper>/<name>.txt.

    Args:
        page: Text of the article to write.
        newspaper: Name of the sub-directory the article is stored in; it is
            created when missing.
        name: File name, without the ".txt" extension.
    """
    folder = os.path.join("..", "data", "articles", newspaper)
    # exist_ok replaces the separate isdir() test, which could race with
    # another process creating the same directory.
    os.makedirs(folder, exist_ok=True)
    # The context manager closes the file even when write() raises.
    with open(os.path.join(folder, name + ".txt"), "w") as f:
        f.write(page)
    writeLog("debug", "articles/{}/{}.txt saved!".format(newspaper, name))

In [None]:
def url_to_name(newspaper, url):
    """Derive an article name from its url.

    The name is the last meaningful path segment of the url, without its
    extension and with dashes turned into underscores.
    """
    hex_digits = "0123456789abcdef"
    trimmed = url
    if newspaper == "washingtonpost":
        # Drop the last path segment (a "..._story.html" file name or the
        # empty string that follows a trailing slash).
        trimmed = trimmed.rpartition("/")[0]
        tail = trimmed.rpartition("/")[2]
        # A segment made only of hex digits is an identifier, not a title
        if all(char in hex_digits for char in tail):
            trimmed = trimmed.rpartition("/")[0]
    # Strip trailing numeric ids, dates and separators
    trimmed = trimmed.rstrip("0123456789-_/")
    last_segment = trimmed.rpartition("/")[2]
    stem = last_segment.partition(".")[0]
    return stem.replace("-", "_")

In [None]:
def fetch_html_page(url, save=False):
    """Download the html page at *url* with curl, unless it is already saved.

    Args:
        url: Address of the article. The newspaper name is the host label
            that precedes ".com".
        save: When true, the downloaded page is also written to disk with
            page_to_file.

    Returns:
        A (page, newspaper, name) tuple, or ("", "", "") when a saved copy
        of the page already exists.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
    """
    # Drop the scheme first, so that a host without a sub-domain
    # ("https://nytimes.com/...") gives "nytimes" rather than
    # "https://nytimes", which would also corrupt the paths built below.
    host_and_path = url.split("://", 1)[-1]
    newspaper = host_and_path.split(".com")[0].split(".")[-1]
    folder = os.path.join("..", "data", "webpages", newspaper)
    os.makedirs(folder, exist_ok=True)
    name = url_to_name(newspaper, url)
    fname = os.path.join(folder, name + ".html")
    # Never download a page that is already stored
    if os.path.isfile(fname):
        writeLog("warn", "File exists: " + fname)
        return "", "", ""
    timer_start("Fetching {}".format(url), True)
    page = check_output(["curl", "-s", url]).decode("utf-8")
    timer_stop("Fetching {}".format(url))
    if save:
        page_to_file(page, newspaper, name)
    return page, newspaper, name

In [None]:
def read_article(newspaper, name):
    """Return the content of ../data/webpages/<newspaper>/<name>.html.

    Args:
        newspaper: Name of the sub-directory the page is stored in.
        name: File name, without the ".html" extension.

    Returns:
        The text of the file, unchanged.
    """
    path = os.path.join("..", "data", "webpages", newspaper, name + ".html")
    # read() returns the text as stored. Joining readlines() with "\n"
    # doubled every newline, because each line already ends with one.
    with open(path, "r") as f:
        return f.read()

In [None]:
def extract_text_html_nytimes(text):
    """Extract the article text from a nytimes html page.

    Returns the stripped, non-empty story paragraphs joined by newlines.
    """
    document = BeautifulSoup(text, "html.parser")
    # Take script and style elements out of the tree
    for tag in document.find_all(["script", "style"]):
        tag.extract()
    # Story paragraphs carry this exact class attribute
    body = document.find_all("p", attrs={"class": "story-body-text story-content"})
    stripped = (paragraph.get_text().strip() for paragraph in body)
    return "\n".join(piece for piece in stripped if piece != "")

In [None]:
def extract_text_html_usatoday(text):
    """Extract the article text from a usatoday html page.

    Returns the stripped, non-empty "p-text" paragraphs joined by newlines,
    without the "Related:" / "Read more:" lines and without any
    "USA TODAY" occurrence.
    """
    document = BeautifulSoup(text, "html.parser")
    # Take script and style elements out of the tree
    for tag in document.find_all(["script", "style"]):
        tag.extract()
    unwanted = ["Related:", "Read more:"]
    kept = []
    for paragraph in document.find_all('p', attrs={"class" : "p-text"}):
        line = paragraph.get_text().strip()
        # Skip empty paragraphs and link-teaser lines
        if line == "" or line in unwanted:
            continue
        kept.append(line)
    # The brand name is removed from the assembled text
    return "\n".join(kept).replace("USA TODAY", "")

In [None]:
def extract_text_html_washingtonpost(text):
    """Extract the article text from a washingtonpost html page.

    Returns the stripped, non-empty paragraphs of the first <article>
    element, joined by newlines. Raises IndexError when the page has no
    <article> element.
    """
    document = BeautifulSoup(text, "html.parser")
    # Take script and style elements out of the tree
    for tag in document.find_all(["script", "style"]):
        tag.extract()
    # Only the first <article> element is searched
    first_article = document.find_all('article')[0]
    kept = []
    for paragraph in first_article.find_all('p', attrs={"class" : ""}):
        line = paragraph.get_text().strip()
        if line != "":
            kept.append(line)
    return "\n".join(kept)

In [None]:
def extract_text_html(text, newspaper):
    """Extract the article text from an html page.

    The extraction routine is picked from the newspaper name; an unknown
    newspaper is logged as an error and gives an empty string.
    """
    extractors = {
        "nytimes": extract_text_html_nytimes,
        "usatoday": extract_text_html_usatoday,
        "washingtonpost": extract_text_html_washingtonpost,
    }
    extractor = extractors.get(newspaper)
    if extractor is None:
        writeLog("error", "Unhandled newspaper: {}".format(newspaper))
        return ""
    return extractor(text)

In [None]:
def test_article(newspaper, name):
    """Print the text extracted from one saved html page."""
    html = read_article(newspaper, name)
    print(extract_text_html(html, newspaper))

In [None]:
def all_articles_to_files(newspaper, verbose=False):
    """Extract and save the article of every saved page of a newspaper.

    Each ../data/webpages/<newspaper>/*.html page is read, its article is
    extracted and written with article_to_file. Processing stops at the
    first page whose extraction raises: the error and the page are logged
    and the remaining pages are left untouched.

    With verbose set, each extracted article is also printed.
    """
    pattern = "../data/webpages/{}/*.html".format(newspaper)
    for path in glob.glob(pattern):
        # File name without directory and extension
        name, _ = os.path.splitext(os.path.basename(path))
        html = read_article(newspaper, name)
        try:
            art = extract_text_html(html, newspaper)
        except Exception as e:
            writeLog("error", "{}/{} error: {}".format(newspaper, name, e))
            writeLog("debug", html)
            return
        article_to_file(art, newspaper, name)
        if verbose:
            print(art + "\n\n-----\n")

In [None]:
def clean_article(art):
    """Strip punctuation from an article.

    Commas, periods, ellipses, colons, curly double quotes and em dashes
    are removed; straight and curly single quotes and straight double
    quotes are replaced by a space.

    Args:
        art: Text of the article.

    Returns:
        The cleaned text.
    """
    art = re.sub("[,.…:“”—]", "", art)
    # The class used to list the right single quote twice, so the left one
    # (‘) was never replaced. Both curly single quotes are handled now.
    art = re.sub("['‘’\"]", " ", art)
    return art

In [None]:
def all_articles_to_corpus():
    """Gather every saved article into a single corpus file.

    Each ../data/articles/<newspaper>/*.txt file becomes one line of
    ../data/articles.txt: its lines are joined with spaces and the result
    is passed through clean_article. The matching line of
    ../data/sources.txt holds "<newspaper>/<name>" for that article.
    Both output files are overwritten.
    """
    # Sorted so that the corpus is the same from one run to the next;
    # listdir() and glob() give no ordering guarantee.
    newspapers = sorted(os.listdir("../data/articles"))
    # The context managers close every file even when a read or a write
    # fails part-way through.
    with open("../data/articles.txt", "w") as corpus, \
            open("../data/sources.txt", "w") as sources:
        for newspaper in newspapers:
            articles = sorted(glob.glob("../data/articles/{}/*.txt".format(newspaper)))

            for article in articles:
                with open(article) as f:
                    art = " ".join(f.read().splitlines())
                corpus.write(clean_article(art))
                corpus.write("\n")
                # Keep "<newspaper>/<name>": drop the directory prefix and
                # the ".txt" extension.
                sources.write(article[len("../data/articles/"):-4] + "\n")

In [None]:
# fetch_html_page("https://www.nytimes.com/2017/07/26/us/politics/trump-transgender-military.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/us/politics/white-house-aides-think-trump-will-let-sessions-stay-for-now.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/us/politics/lisa-murkowski-health-care.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/27/us/politics/scaramucci-leaks-priebus-white-house-justice.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/us/politics/trump-transgender-military.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/27/world/middleeast/isis-yazidi-women-rape-iraq-mosul-slavery.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/27/opinion/wow-trump-cant-terminate.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/us/politics/senate-rejects-repealing-obamacare-without-replacement-trump.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/opinion/white-house-lies-cia.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/world/asia/dolam-plateau-china-india-bhutan.html", True)
# fetch_html_page("https://www.nytimes.com/2017/07/26/opinion/donald-trumps-assault-on-jeff-sessions.html", True)

In [None]:
# all_articles_to_files("nytimes", True)

In [None]:
# fetch_html_page("https://www.usatoday.com/story/news/world/2017/08/02/cancun-playa-del-carmen-tulum-violent-crime-encroaching/527247001/", True)
# fetch_html_page("https://www.usatoday.com/story/news/world/2017/08/02/trumps-scotland-golf-course-neighbors-decry-bully-trying-expand/487073001/", True)
# fetch_html_page("https://www.usatoday.com/story/life/music/2017/08/02/kanye-west-files-10-million-lawsuit-over-canceled-saint-pablo-tour/531529001/", True)
# fetch_html_page("https://www.usatoday.com/story/news/politics/2017/08/02/president-trump-signs-new-russia-sanctions-bill/532442001/", True)
# fetch_html_page("http://college.usatoday.com/2017/08/01/college-women-in-tech-were-encountering-sexism-already/", True)
# fetch_html_page("https://www.usatoday.com/story/opinion/2017/08/02/pass-gop-tax-cuts-thanksgiving-get-ready-speaker-nancy-pelosi/529438001/", True)
# fetch_html_page("https://www.usatoday.com/story/opinion/2017/08/01/set-health-record-straight-republicans-helped-craft-obamacare-ross-baker-column/523952001/", True)
# fetch_html_page("https://www.usatoday.com/story/opinion/2017/08/02/donald-trump-defenders-nothing-offensive-except-anthony-scaramuccis-mouth-kirsten-powers-column/530087001/", True)

In [None]:
# all_articles_to_files("usatoday", True)

In [None]:
# fetch_html_page("https://www.washingtonpost.com/world/asia_pacific/north-korea-under-no-circumstances-will-give-up-its-nuclear-weapons/2017/08/07/33b8d319-fbb2-4559-8f7d-25e968913712_story.html", True)
# fetch_html_page("https://www.washingtonpost.com/news/post-politics/wp/2017/08/07/trump-says-his-political-base-is-stronger-than-ever-despite-polling-to-the-contrary/", True)
# fetch_html_page("https://www.washingtonpost.com/news/post-politics/wp/2017/08/07/trump-renews-attack-on-democratic-senator-calling-him-a-vietnam-con-artist-on-twitter/", True)
# fetch_html_page("https://www.washingtonpost.com/blogs/plum-line/wp/2017/08/07/as-mueller-closes-in-trump-prepares-his-base-for-the-worst/", True)
# fetch_html_page("https://www.washingtonpost.com/news/powerpost/paloma/daily-202/2017/08/07/daily-202-democrats-are-moving-left-and-that-won-t-necessarily-hurt-them-in-2018/5987ab1e30fb045fdaef114d/", True)
# fetch_html_page("https://www.washingtonpost.com/politics/apply-by-fax-before-it-can-hire-foreign-workers-trumps-mar-a-lago-club-advertises-at-home--briefly/2017/08/07/7198576c-792f-11e7-9eac-d56bd5568db8_story.html", True)
# fetch_html_page("https://www.washingtonpost.com/news/the-fix/wp/2017/08/07/trump-tvs-real-news-sounds-more-like-real-propaganda/", True)
# fetch_html_page("https://www.washingtonpost.com/news/energy-environment/wp/2017/08/07/the-arctics-fabled-passage-is-opening-up-this-is-what-it-looks-like/", True)
# fetch_html_page("https://www.washingtonpost.com/news/post-nation/wp/2017/08/07/chicago-to-sue-justice-department-over-new-police-grant-rules-targeting-sanctuary-cities/", True)
# fetch_html_page("https://www.washingtonpost.com/news/to-your-health/wp/2017/08/07/flesh-eating-sea-bugs-attacked-an-australian-teens-legs-there-was-no-stopping-the-bleeding/", True)
# fetch_html_page("https://www.washingtonpost.com/news/worldviews/wp/2017/08/07/british-model-feared-for-her-life-second-by-second-as-kidnappers-allegedly-plotted-online-auction/", True)

In [None]:
# all_articles_to_files("washingtonpost", True)

In [None]:
# Rebuild ../data/articles.txt and ../data/sources.txt from the saved articles.
all_articles_to_corpus()