In [57]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# News Scraper
**Objective:**
Extract the following information from each news article linked on Rappler's main page:
- date
- title
- full article
- author

The scraped article information is then exported to a JSON file.

### Get URL using requests

In [58]:
url = 'https://www.rappler.com/'
# requests has no default timeout, so without one a stalled connection would
# hang this cell indefinitely.
webpage = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page and
# hitting a confusing failure in a later cell.
webpage.raise_for_status()

### Parse HTML Data

In [59]:
# Parse the raw bytes of the main page with the 'html.parser' backend.
soup = BeautifulSoup(markup=webpage.content, features='html.parser')

In [60]:
# Accumulator for the article URLs gathered from the main page.
article_links = []

# Search the whole main-page container so that all articles on the page are
# considered. To restrict the crawl to the top stories only, use instead:
#     soup.find('div', id='top-stories')
page_main_head = soup.find('div', attrs={'class': 'container'})

In [61]:
# Collect the URL of every article linked from Rappler's main page.
# The type/target/class_ filters skip unnecessary links such as share buttons,
# category links, etc. href=True additionally skips anchors that have no href
# at all, which would otherwise put None into the list.
for a in page_main_head.find_all('a', href=True, type=False, target=False, class_=False):
    href = a['href'].strip()
    if not href:
        continue
    # Resolve relative hrefs against the main-page URL so that requests can
    # fetch them later.
    raw_link = urljoin(url, href)
    # Drop non-web schemes (mailto:, javascript:, ...) that requests cannot fetch.
    if raw_link.startswith(('http://', 'https://')):
        article_links.append(raw_link)

In [62]:
# Remove duplicate links while keeping first-seen order. dict keys are unique
# and preserve insertion order, whereas list(set(...)) yields an arbitrary
# order that can differ from one kernel session to the next.
final_links = list(dict.fromkeys(article_links))

### Create a scraper function to be called for each link in the list

In [63]:
article_details = []


def scraper(site_url, timeout=30):
    """Scrape one article page and record its details.

    Downloads ``site_url``, extracts the headline, publication time, author
    and body text, and appends them as a
    ``(headline, time, author, full_text)`` tuple to the module-level
    ``article_details`` list.

    Pages that lack any of the expected article elements are skipped (nothing
    is appended) instead of failing with ``AttributeError`` on a missing tag.

    Parameters
    ----------
    site_url : str
        URL of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block indefinitely.

    Returns
    -------
    list of tuple
        The shared ``article_details`` list (the same object on every call),
        including the entry appended by this call, if any.

    Raises
    ------
    requests.RequestException
        If the page cannot be downloaded (invalid URL, connection error,
        timeout).
    """
    article_webpage = requests.get(site_url, timeout=timeout)
    sub_soup = BeautifulSoup(article_webpage.content, 'html.parser')

    headline_tag = sub_soup.find('h1', class_='post-single__title')
    # NOTE(review): the visible text of this tag can be relative
    # (e.g. "2 mins ago") rather than an absolute timestamp.
    time_tag = sub_soup.find('time', class_='entry-date published')
    author_tag = sub_soup.find('div', class_='post-single__authors')
    content = sub_soup.find('div', class_='post-single__content entry-content')

    # find() returns None for an absent tag; a page missing any of these is
    # not a regular article page, so leave the results untouched.
    required_tags = (headline_tag, time_tag, author_tag, content)
    if any(tag is None for tag in required_tags):
        return article_details

    article_hl = headline_tag.text.strip()
    article_time = time_tag.text.strip()
    article_auth = author_tag.text.strip()
    article_par = gettext(content.find_all('p'))

    article_details.append((article_hl, article_time, article_auth, article_par))
    return article_details
    
def gettext(get_article_par):
    """Join the text of the given paragraph tags into a single string.

    Parameters
    ----------
    get_article_par : iterable
        Objects exposing a ``text`` attribute, such as the ``<p>`` tags
        returned by ``find_all('p')``.

    Returns
    -------
    str
        Each paragraph's whitespace-stripped text, separated by single
        spaces. An empty string is returned when there are no paragraphs.
    """
    # Joining once over all paragraphs avoids rebuilding the string on every
    # iteration and yields "" for empty input instead of an unbound local.
    return " ".join(par.text.strip() for par in get_article_par)

In [64]:
# Start from an empty result list so that re-running this cell does not append
# every article a second time.
article_details.clear()

for link in final_links:
    try:
        scraper(link)
    except requests.RequestException as err:
        # One unreachable or malformed link should not discard the articles
        # that were already collected; report it and move on.
        print('Skipping', link, '-', err)

# scraper() appends to (and returns) the shared article_details list; bind it
# here so that `articles` exists even when final_links is empty.
articles = article_details

### Format table to pandas dataframe then export to JSON

In [65]:
# Column names, in the same order as the fields of each scraped tuple.
label = ['headline', 'date', 'author', 'full_article']

# One row per scraped article.
df_article_deets = pd.DataFrame(data=articles, columns=label)

# Write the table as a JSON array with one object per article.
df_article_deets.to_json(path_or_buf='Rappler Articles JSON.json', orient='records')

In [66]:
# Preview only the first rows; the full table is already saved to the JSON file
# and displaying every article would bloat the notebook output.
df_article_deets.head(10)

Unnamed: 0,headline,date,author,full_article
0,A prolonged China slowdown raises risks for gl...,"Apr 21, 2022 9:45 PM PHT",Reuters,"BOAO, China – A prolonged slowdown in China wo..."
1,Withdraw from race? ‘This fight is more than a...,"Apr 21, 2022 8:00 PM PHT",Bea Cupin,"CEBU, Philippines – Vice President Leni Robred..."
2,"CAMPAIGN TRAIL: Marcos Jr., Robredo stage back...",2 mins ago,Rappler.com,"Fresh off the days-long Holy Week break, presi..."
3,"Use less air-con, drive slower to shun Russian...","Apr 21, 2022 10:15 PM PHT",Reuters,"BRUSSELS, Belgium – Raise your air-conditioner..."
4,LIST: Complaints filed against red-tagger Lorr...,"Apr 21, 2022 7:55 PM PHT",Michelle Abad,"MANILA, Philippines – Communications Undersecr..."
5,Global tourism to recover from pandemic by 202...,"Apr 21, 2022 9:15 PM PHT",Reuters,"MANILA, Philippines – The global travel and to..."
6,"La Salle pulls off overtime escape, denies bre...","Apr 21, 2022 10:06 PM PHT",JR Isaga,"MANILA, Philippines – Winless UE gave La Salle..."
7,LIVE UPDATES: Overseas voting for the 2022 Phi...,2 hours ago,Jojo Dass,Over 1.6 million overseas Filipinos will begin...
8,LIVE UPDATES: Russia-Ukraine crisis,4 mins ago,Reuters,After weeks of escalating tensions in the regi...
9,Radisson Hotel Group plans Asia-Pacific expans...,"Apr 21, 2022 8:55 PM PHT",Reuters,"MANILA, Philippines – Radisson Hotel Group pla..."
