## Import Libraries

In [None]:
import requests
import json, os
from bs4 import BeautifulSoup as bs
import datetime

## Articles Scraper
- Title
- Date
- Content
- Summary
- ID

In [None]:
from requests.api import get
def get_id(url):
  '''Return the second-to-last "/"-separated segment of url (the news ID).'''
  # Only the last two separators matter, so split from the right.
  pieces = url.rsplit('/', 2)
  return pieces[-2]

def get_summary(text):
    '''Extract the article's short description from the page source.

    The page embeds a JavaScript assignment of the form
    ``window.kmklabs.article = {...};``. The JSON object that follows the
    assignment is decoded and its ``shortDescription`` entry is returned.

    Args:
        text: Full HTML source of an article page.

    Returns:
        The value stored under ``shortDescription`` in the embedded object.

    Raises:
        ValueError: If the assignment is not present, or the object after it
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the decoded object has no ``shortDescription`` key.
    '''
    marker = 'window.kmklabs.article = '
    start = text.find(marker)
    if start == -1:
        raise ValueError('window.kmklabs.article assignment not found in page source')
    # raw_decode requires the document to start at the first character.
    payload = text[start + len(marker):].lstrip()
    # raw_decode stops at the end of the first complete JSON value, so a ';'
    # inside a string no longer truncates the payload the way split(';') did.
    data, _ = json.JSONDecoder().raw_decode(payload)
    return data['shortDescription']

def collect_data(text):
  '''Pull the title, date, body paragraphs and summary out of an article page.

  Args:
    text: HTML source of the article page.

  Returns:
    A tuple ``(title, date, article, summary)`` where ``article`` is a list
    holding the text of every content block found, in document order.

  Raises:
    IndexError: If the title or date element is not present in the page.
  '''
  page = bs(text)

  title_tag = page.find_all('h1', {'class': 'read-page--header--title'})[0]
  title = title_tag.get_text()

  time_tag = page.find_all('time', {'class': 'read-page--header--author__datetime updated'})[0]
  date = time_tag.get_text()

  body_items = page.find_all('div', {'class': 'article-content-body__item-content'})
  article = [item.get_text() for item in body_items]

  # The summary is read from the raw source, not from the parsed tree.
  summary = get_summary(text)
  return title, date, article, summary

def scrape_article(url, timeout=30):
  '''Download one article page and return its fields as a dict.

  Args:
    url: Address of the article page.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a single stalled connection would block the whole run.

  Returns:
    A dict with the keys ``id``, ``title``, ``date``, ``article`` and
    ``summary``.

  Raises:
    requests.RequestException: On network failure, timeout, or an HTTP
      error status.
    Exception: Whatever ``get_id`` / ``collect_data`` raise when the page
      does not have the expected structure.
  '''
  response = requests.get(url, timeout=timeout)
  # Fail early on 4xx/5xx instead of trying to parse an error page.
  response.raise_for_status()
  # Use the final URL (after any redirects) to derive the ID.
  news_id = get_id(response.url)
  title, date, article, summary = collect_data(response.text)
  return {
      'id': news_id,
      'title': title,
      'date': date,
      'article': article,
      'summary': summary
  }

def news_dataset_generator(url_list):
  '''Scrape every URL in url_list and collect the articles that succeed.

  Progress and failures are printed. A URL that fails is counted, reported
  and skipped; it never aborts the run.

  Args:
    url_list: Sequence of article URLs.

  Returns:
    List of article dicts as produced by ``scrape_article``, in input order,
    without the ones that failed.
  '''
  dataset = []
  num_news = len(url_list)
  num_error = 0
  for i, url in enumerate(url_list):
    print('Scraping {}/{}'.format(i+1, num_news))
    try:
      dataset.append(scrape_article(url))
    # Exception (not a bare except) so Ctrl-C still interrupts a long run.
    except Exception as exc:
      num_error += 1
      # Deriving the ID must not itself crash the error handler.
      try:
        news_id = get_id(url)
      except Exception:
        news_id = 'unknown'
      # Report the same 1-based position as the progress line above.
      print('Error scraping data {}/{}; ID {}; Error:{}; Reason: {!r}'.format(i+1, num_news, news_id, num_error, exc))
  return dataset


### Test scraper

In [None]:

# Three sample article URLs for a quick end-to-end check of the scraper.
urls = [
    'https://www.liputan6.com/news/read/4934674/transjakarta-tutup-sementara-9-halte-mulai-15-april-2022',
    'https://www.liputan6.com/news/read/4934691/panglima-andika-pastikan-tni-tidak-represif-saat-bantu-polri-jaga-demo-11-april-2022',
    'https://www.liputan6.com/news/read/4951071/liputan6-update-laporan-mudik-dari-garut',
]
data = news_dataset_generator(urls)


Scraping 1/3
Scraping 2/3
Scraping 3/3


In [None]:
# Display the records returned by news_dataset_generator for the sample URLs.
data

[{'article': ['Liputan6.com, Jakarta - PT Transjakarta berencana menutup sementara sembilan halte mulai 15 April 2022. Penutupan tersebut karena adanya revitalisasi di 11 halte bus Transjakarta.\n"Guna meningkatkan kelengkapan fasilitas dan kenyamanan pelanggan, Transjakarta akan melakukan revitalisasi di 11 halte. Mulai 15 April 2022, sembilan dari 11 halte akan dilakukan penutupan sementara," tulis Transjakarta dalam akun Twitter resminya @PT_Transjakarta di Jakarta, Sabtu (9/4/2022), seperti dikutip dari Antara.\n\n\nBaca Juga\n\nTransjakarta Perpanjang Jam Operasional Layanan hingga Pukul 22.00\nBus Transjakarta Sebabkan Kecelakaan Beruntun di Tol Jagorawi\nWagub DKI Jakarta soal Sudirman Said Jadi Komut Transjakarta: Punya Kemampuan\n\n\n\n\n\nSembilan halte Bus Transjakarta yang akan ditutup sementara mulai 15 April 2022 itu adalah:\n1. Halte Dukuh Atas 1\n2. Halte Tosari\n3. Halte Juanda\n4. Halte Cawang Cikoko\n5. Halte Bundaran HI\n6. Halte Sarinah\n7. Halte Kebon Pala\n8. Hal

## Get News URLs

In [None]:
def generate_date_range(date_range=30):
  '''List the last date_range days, newest first, formatted as YYYY/MM/DD.

  The first entry is today's date; each following entry is one day earlier.
  '''
  today = datetime.datetime.today()
  return [
      (today - datetime.timedelta(days=offset)).strftime("%Y/%m/%d")
      for offset in range(date_range)
  ]

def get_article_url(url, timeout=30):
  '''Collect the article links listed on one index page.

  Args:
    url: Address of the index page.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    List of ``href`` values of every anchor with the class
    ``articles--rows--item__title-link``. The list is empty (never None)
    when the page has no such anchors, so callers can always iterate it.

  Raises:
    requests.RequestException: On network failure or timeout.
  '''
  request = requests.get(url, timeout=timeout)
  soup = bs(request.text)
  links = soup.find_all('a', {'class': 'articles--rows--item__title-link'})
  # Anchors without an href are skipped so no None ends up in the URL list.
  return [link.get('href') for link in links if link.get('href')]

def generate_news_index_url(channel, date_range=30, num_pages=5):
  '''Build the index-page URLs of a channel for the most recent days.

  Args:
    channel: Channel name used as the first path segment (e.g. 'saham').
    date_range: Number of days to cover, counting back from today.
    num_pages: Number of index pages to generate per day (``?page=1`` up to
      ``?page=num_pages``).

  Returns:
    List of URLs of the form
    ``https://www.liputan6.com/<channel>/indeks/<YYYY/MM/DD>?page=<n>``,
    grouped by day (newest first) and ordered by page within each day.
  '''
  base_url = 'https://www.liputan6.com'
  index_urls = []
  for date in generate_date_range(date_range):
    url = '{}/{}/indeks/{}'.format(base_url, channel, date)
    # Page numbers in the query string are 1-based.
    for page in range(1, num_pages + 1):
      index_urls.append('{}?page={}'.format(url, page))
  return index_urls

In [None]:
# Try the link extractor on a single index page (channel 'saham', 2022/05/12, page 2).
get_article_url('https://www.liputan6.com/saham/indeks/2022/05/12?page=2')

## Scrape Multiple Article URLs

In [None]:
# Skip this for now
def scraper_urls(channels):
  '''Gather article URLs from the index pages of several channels.

  Args:
    channels: Iterable of channel names.

  Returns:
    Flat list of every article link found on the generated index pages,
    in the order the pages were visited. The total count is also printed.
  '''
  url_list = []
  for channel in channels:
    url_list.extend(generate_news_index_url(channel))

  news_urls = []

  for url in url_list:
    articles = get_article_url(url)
    # An index page without article links yields nothing (None or empty);
    # test for that directly instead of hiding it behind a bare except.
    if not articles:
      continue
    news_urls.extend(articles)
  print(len(news_urls))

  return news_urls

### Save URLs as JSON

In [None]:
def save_data(title, data):
  '''Write data to the file at path title as indented, UTF-8 encoded JSON.

  Non-ASCII characters are written as-is rather than as \\u escapes.
  '''
  with open(title, 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, indent=2, ensure_ascii=False)

def load_data(title):
  '''Read the UTF-8 JSON file at path title and return the decoded content.'''
  with open(title, encoding='utf-8') as in_file:
    loaded = json.load(in_file)
  return loaded

In [None]:
# Each inner list is one group of channels whose links are saved together in one file.
channels = [['global', 'regional'], ['otomotif', 'tekno']]

for channel in channels:
  # `channel` is a list of channel names here, not a single name.
  news_urls = scraper_urls(channel)
  # Join the names without a separator, e.g. ['global', 'regional'] -> 'globalregional'.
  title = ''.join(channel)
  filename = 'liputan6_{}_links.json'.format(title)
  save_data(filename, news_urls)

2088
921


In [None]:
# NOTE(review): `news_urls` is whatever an earlier cell left behind. After the loop in the
# previous cell it holds the last channel group, so confirm it really contains the
# saham/crypto links before saving it under this file name.
save_data('liputan6_sahamCrypto_links.json', news_urls)

## Start scraping from Liputan6

In [None]:
import requests
import json, os
from bs4 import BeautifulSoup as bs
import datetime

In [None]:
# Load the article links saved earlier for the 'global' + 'regional' channel group
# and show how many there are.
news_urls = load_data('liputan6_globalregional_links.json')
len(news_urls)

2088

In [None]:
# Scrape every loaded URL; articles that fail are reported in the output and left out of `dataset`.
dataset = news_dataset_generator(news_urls)

Scraping 1/2088
Error scraping data 0/2088; ID 4960708; Error:1
Scraping 2/2088
Scraping 3/2088
Error scraping data 2/2088; ID 4960540; Error:2
Scraping 4/2088
Scraping 5/2088
Scraping 6/2088
Scraping 7/2088
Scraping 8/2088
Scraping 9/2088
Scraping 10/2088
Error scraping data 9/2088; ID 4960512; Error:3
Scraping 11/2088
Scraping 12/2088
Scraping 13/2088
Scraping 14/2088
Scraping 15/2088
Scraping 16/2088
Scraping 17/2088
Scraping 18/2088
Scraping 19/2088
Scraping 20/2088
Scraping 21/2088
Scraping 22/2088
Scraping 23/2088
Scraping 24/2088
Scraping 25/2088
Scraping 26/2088
Scraping 27/2088
Error scraping data 26/2088; ID 4960184; Error:4
Scraping 28/2088
Scraping 29/2088
Error scraping data 28/2088; ID 4959784; Error:5
Scraping 30/2088
Scraping 31/2088
Scraping 32/2088
Scraping 33/2088
Scraping 34/2088
Scraping 35/2088
Scraping 36/2088
Scraping 37/2088
Scraping 38/2088
Scraping 39/2088
Scraping 40/2088
Error scraping data 39/2088; ID 4959724; Error:6
Scraping 41/2088
Scraping 42/2088
Erro

In [None]:
# Persist the scraped articles as JSON.
save_data('liputan6_globalregional_raw.json', dataset)