In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# News Scraper
Objective:
To take the following information from several online news articles by Rappler:
- date
- title
- full article
- author

The scraped article information is then exported to a JSON file.

### Fetch the Rappler homepage using requests

In [2]:
# Rappler's homepage is the entry point for collecting article links.
url = 'https://www.rappler.com/'
# A timeout keeps this cell from hanging forever on a stalled connection.
webpage = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page below.
webpage.raise_for_status()

### Parse HTML Data

In [3]:
# Parse the downloaded homepage bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=webpage.content, features='html.parser')

In [4]:
# Select the container that holds all of the articles on the main page.
# (To take only the top stories instead, look up the 'top-stories' div by id:
#  soup.find('div', id = 'top-stories').)
page_main_head = soup.find('div', class_='container')

# Raw hrefs gathered from that container are accumulated here.
article_links = list()

In [5]:
# Collect the URL of every article linked from Rappler's main page.
# type / target / class_ = False keep only anchors WITHOUT those attributes,
# which skips unnecessary links such as share buttons and news category links.
# href = True additionally skips anchors that have no href at all; those would
# otherwise add None to the list and break the later requests.get() call.
for a in page_main_head.find_all('a', href = True, type = False, target = False, class_ = False):
    article_links.append(a['href'])

In [6]:
# Remove duplicate links while keeping first-seen order.
# set() also de-duplicates, but the iteration order of a set of strings varies
# between kernel sessions, so the scraped rows would be ordered differently on
# every run; dict keys preserve insertion order and keep the run reproducible.
final_links = list(dict.fromkeys(article_links))

### Create a scraper function to be called for each item in the link list

In [7]:
# Shared accumulator: scraper() appends one tuple per successfully parsed article.
article_details = []
def scraper(site_url):
    """Scrape one article page and record its details.

    Downloads ``site_url``, extracts the headline, publication time, author
    block and paragraph text, and appends them to the module-level
    ``article_details`` list as a
    ``(headline, date, author, full_article)`` tuple.

    Pages that lack any of the expected elements are skipped (nothing is
    appended) instead of raising ``AttributeError`` on a ``None`` result.

    Parameters
    ----------
    site_url : str
        URL of the page to download.

    Returns
    -------
    list
        The shared ``article_details`` list, including the tuple added by
        this call (if any).
    """
    # Timeout so one stalled request cannot hang the whole scraping loop.
    article_webpage = requests.get(site_url, timeout=30)
    sub_soup = BeautifulSoup(article_webpage.content, 'html.parser')

    headline_tag = sub_soup.find('h1', class_ = 'post-single__title')
    time_tag = sub_soup.find('time', class_ = 'entry-date published')
    author_tag = sub_soup.find('div', class_ = 'post-single__authors')
    content = sub_soup.find('div', class_ = 'post-single__content entry-content')

    # find() returns None when nothing matches; such a page does not have the
    # expected article layout, so skip it rather than crash on `.text`.
    if any(tag is None for tag in (headline_tag, time_tag, author_tag, content)):
        return article_details

    article_hl = headline_tag.text.strip()
    article_time = time_tag.text.strip()
    article_auth = author_tag.text.strip()

    get_article_par = content.find_all('p')
    # A body without any <p> tags is recorded with an empty article text.
    article_par = gettext(get_article_par) if get_article_par else ""

    article_details.append((article_hl, article_time, article_auth, article_par))
    return article_details
    
def gettext(get_article_par):
    """Join the stripped text of each paragraph into a single string.

    Parameters
    ----------
    get_article_par : iterable
        Objects exposing a ``.text`` string attribute (e.g. the ``<p>`` tags
        returned by ``find_all('p')``).

    Returns
    -------
    str
        The paragraphs' texts, each stripped of surrounding whitespace and
        separated by a single space. An empty input yields ``""`` (previously
        this raised ``UnboundLocalError``).
    """
    # Join once at the end instead of re-joining on every loop iteration.
    return " ".join(par.text.strip() for par in get_article_par)

In [8]:
# Start from an empty accumulator so re-running this cell does not append
# duplicate rows to the shared list.
article_details.clear()
# scraper() returns the shared article_details list; bind `articles` up front
# so it is defined even when there are no links to scrape.
articles = article_details
for link in final_links:
    articles = scraper(link)

### Convert the list to a pandas DataFrame, then export to JSON

In [9]:
label = ['headline', 'date', 'author', 'full_article']
df_article_deets = pd.DataFrame(articles, columns = label)
# Drop live update articles: they do not have a proper date format (the date is
# written as "x Hours Ago", i.e. exactly three whitespace-separated tokens).
# A boolean mask removes them in one pass instead of dropping row by row with
# a manually tracked index.
is_relative_date = df_article_deets['date'].str.split().str.len() == 3
df_article_deets = df_article_deets[~is_relative_date]
df_article_deets.to_json('Rappler Articles JSON.json', orient = 'records')

In [10]:
# Display the filtered articles that were exported to JSON above.
df_article_deets

Unnamed: 0,headline,date,author,full_article
0,Korean leaders exchange friendly letters in ra...,"Apr 22, 2022 8:05 AM PHT",Reuters,"SEOUL, South Korea – North Korean leader Kim J..."
1,"‘Always running scared:’ Marcos, Duterte retur...","Apr 21, 2022 10:58 PM PHT",Lian Buan,"LAGUNA, Philippines – Ferdinand “Bongbong” Mar..."
2,"Brazil’s Bolsonaro pardons political ally, snu...","Apr 22, 2022 8:45 AM PHT",Reuters,"RIO DE JANEIRO, Brazil – Brazilian President J..."
3,"14,000 tourists stay in Ilocos Norte during Ho...","Apr 22, 2022 7:57 AM PHT",John Michael Mugas,"LAOAG CITY, Ilocos Norte – Tourists started tr..."
4,Musk secures $46.5 billion in funding for Twit...,"Apr 21, 2022 10:56 PM PHT",Reuters,Elon Musk has secured $46.5 billion in funding...
5,Big companies manage to pass on soaring costs ...,"Apr 21, 2022 10:45 PM PHT",Reuters,"ZURICH, Switzerland – Makers of chocolate bars..."
6,"If elected, Leody de Guzman will first solve h...","Apr 21, 2022 8:39 PM PHT",Jairo Bolledo,"MANILA, Philippines – Labor leader and preside..."
7,"Russian central bank eyes rate cut, warns of s...","Apr 22, 2022 8:40 AM PHT",Reuters,Russia’s central bank will consider cutting it...
8,Olsen Racela hopes win over NU serves as ‘turn...,"Apr 21, 2022 11:43 PM PHT",Martin Mendoza,"MANILA, Philippines – FEU Tamaraws head coach ..."
9,Florida set to strip Disney of self-governing ...,"Apr 22, 2022 8:34 AM PHT",Reuters,"Florida lawmakers on Thursday, April 21, gave ..."
