# Web scraping hilarity from theonion.com: gathering satirical news content as a dataset to train our model later.

In [1]:
import pandas as pd
import bs4 as bs
import urllib.request

## Defining helper functions

In [2]:
# scrape article urls from a newsfeed page
def get_article_urls(newsfeed_page):
    '''
    Take in newsfeed_page (a URL) and return the article_urls list.

    The page is downloaded, parsed with the lxml parser, and every
    <article> element in the body is inspected. The article URL is read
    from the 'data-ga' attribute of the second <a> tag of each article:
    the attribute text is split on commas and the third field, with its
    first and last characters removed, is taken as the URL.

    Articles that do not have that structure (fewer than two <a> tags,
    no 'data-ga' attribute, or fewer than three comma-separated fields)
    are skipped instead of aborting the whole page.
    '''

    # open newsfeed_page and create soup content; the with-block closes
    # the HTTP response as soon as the body has been read
    with urllib.request.urlopen(newsfeed_page) as response:
        web_content = response.read()
    soup = bs.BeautifulSoup(web_content, 'lxml')

    article_urls = []

    # loop through articles to grab each article url on the page
    for artcl in soup.body.find_all('article'):

        # the url is carried by the second a tag in the article
        anchors = artcl.find_all('a')
        if len(anchors) < 2:
            continue

        # grab string text from the data-ga attribute of that a tag
        data_ga = anchors[1].attrs.get('data-ga')
        if data_ga is None:
            continue

        # the third comma-separated field holds the url
        fields = data_ga.split(',')
        if len(fields) < 3:
            continue

        # slice off the first and last characters surrounding the url text
        article_urls.append(fields[2][1:-1])

    return article_urls


# scrape article title and content from an article url
def get_article_content(article_url):
    '''
    Take article_url, parse the page and return the article title and content.

    Returns a pandas Series indexed by 'article_title' and
    'article_content'. The title comes from the first <h1> in the page
    body; the content is the concatenation of the children of every <p>
    in the body, each converted with str() (so nested tags keep their
    markup).

    Raises IndexError if the page body contains no <h1>.
    '''

    # open article_url and create soup content; the with-block closes
    # the HTTP response as soon as the body has been read
    with urllib.request.urlopen(article_url) as response:
        web_content = response.read()
    soup = bs.BeautifulSoup(web_content, 'lxml')

    # the first h1 found in the soup object body is the headline
    headline = soup.body.find_all('h1')[0]

    # .string is None when the tag has more than one child (e.g. nested
    # markup inside the headline); fall back to the tag's full text so the
    # title never becomes the literal string 'None'
    title = headline.string
    title = str(title) if title is not None else headline.get_text()

    # get article content: join once instead of repeated string concatenation
    paras = soup.body.find_all('p')
    content = ''.join(str(section) for para in paras for section in para.contents)

    return pd.Series([title, content], index=['article_title', 'article_content'])
    

## Set constants for the newsfeed base URL and the number of newsfeed pages to scrape (assuming 20 articles per page).

In [3]:
# create a list of web_pages to parse for article URLs
num_base_pages = 221   # this appears to be ALL the newsfeed pages going back to 1990s
base_url = 'https://www.theonion.com/c/news?startIndex='

# startIndex advances by 20 per page: 20, 40, ..., num_base_pages * 20
# NOTE(review): startIndex=0 is never requested - confirm that skipping it is intended
theonion_base_urls = [
    base_url + str(page * 20) for page in range(1, num_base_pages + 1)
]

In [4]:
# spot check: display the generated list of newsfeed page urls
theonion_base_urls

['https://www.theonion.com/c/news?startIndex=20',
 'https://www.theonion.com/c/news?startIndex=40',
 'https://www.theonion.com/c/news?startIndex=60',
 'https://www.theonion.com/c/news?startIndex=80',
 'https://www.theonion.com/c/news?startIndex=100',
 'https://www.theonion.com/c/news?startIndex=120',
 'https://www.theonion.com/c/news?startIndex=140',
 'https://www.theonion.com/c/news?startIndex=160',
 'https://www.theonion.com/c/news?startIndex=180',
 'https://www.theonion.com/c/news?startIndex=200',
 'https://www.theonion.com/c/news?startIndex=220',
 'https://www.theonion.com/c/news?startIndex=240',
 'https://www.theonion.com/c/news?startIndex=260',
 'https://www.theonion.com/c/news?startIndex=280',
 'https://www.theonion.com/c/news?startIndex=300',
 'https://www.theonion.com/c/news?startIndex=320',
 'https://www.theonion.com/c/news?startIndex=340',
 'https://www.theonion.com/c/news?startIndex=360',
 'https://www.theonion.com/c/news?startIndex=380',
 'https://www.theonion.com/c/news?s

## Get individual article urls with get_article_urls function

In [5]:
# gather article urls from every newsfeed page into one flat list
article_urls = []
for newsfeed_url in theonion_base_urls:
    # a distinct loop name keeps the base_url constant from being overwritten
    article_urls.extend(get_article_urls(newsfeed_url))

In [6]:
# spot check: display the collected article urls
article_urls

['https://ogn.theonion.com/blatant-rip-off-the-main-character-in-ghost-of-tsushi-1844415830',
 'https://ogn.theonion.com/deal-alert-an-advance-copy-of-cyberpunk-2077-is-sitt-1844339302',
 'https://ogn.theonion.com/get-excited-gamers-activision-shot-down-a-french-plan-1844281289',
 'https://ogn.theonion.com/come-on-someone-just-spray-painted-gamers-rule-on-th-1844231128',
 'https://ogn.theonion.com/brutal-playstation-has-cancelled-the-entire-ps5-game-l-1844017062',
 'https://ogn.theonion.com/banjo-kazooie-fans-will-love-this-this-man-threw-his-1843980382',
 'https://ogn.theonion.com/major-hype-gamers-have-been-divorcing-their-spouses-be-1843968979',
 'https://ogn.theonion.com/letdown-naughty-dog-says-they-worked-so-hard-on-the-l-1843923560',
 'https://ogn.theonion.com/major-relief-blizzard-has-announced-that-overwatch-p-1843751553',
 'https://ogn.theonion.com/inspiring-cd-projekt-red-to-immortalize-programmers-wh-1843710889',
 'https://ogn.theonion.com/complete-bullshit-designers-of-thi

## Get article content

In [7]:
# create data frame for articles and populate with get_article_content function
# Rows are collected first and the frame is built once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
article_rows = [get_article_content(aurl) for aurl in article_urls]

# columns= pins the column order and keeps both columns even when no rows exist
articles_df = pd.DataFrame(article_rows, columns=['article_title', 'article_content'])

In [8]:
# preview the first rows of the scraped articles
articles_df.head()

Unnamed: 0,article_title,article_content
0,Blatant Rip-Off: The Main Character In ‘Ghost ...,"Well, gamers, this is a huge letdown. After ye..."
1,Deal Alert: An Advance Copy Of ‘Cyberpunk 2077...,"All aboard, gamers! We’ve uncovered a once-in-..."
2,"Get Excited, Gamers! Activision Shot Down A Fr...",Here is thrilling news that should have every ...
3,Come On: Someone Just Spray-Painted ‘Gamers Ru...,"Gamers, ever since our founding, we have pride..."
4,Brutal: Playstation Has Cancelled The Entire P...,"Well, Playstation fans, it looks like we’re al..."


## Save article content (DataFrame) to CSV

In [9]:
# write the scraped articles to disk as CSV, leaving out the row index column
articles_df.to_csv(
    path_or_buf=r'./Resources/theonion_articles_raw.csv',
    index=False,
)