# BBC Burmese Webscraper (2): Scraping Headers and Dates of Multiple Pages
by <a href="https://www.linkedin.com/in/la-wun-nannda-b047681b5/">`La Wun Nannda`</a>

## Libraries

In [1]:
# import libraries
from bs4 import BeautifulSoup # this module helps in web scraping
import requests  # this module helps us to download a web page
import pandas as pd

## Navigating Remaining Pages

In [2]:
# webscrape the main page
url = "https://www.bbc.com/burmese/topics/c9wpm0en9jdt"
# requests waits forever by default; give up after 30 seconds on a stalled connection
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html5lib')

# links to other pages: every anchor with the pagination class that carries an href
other_pages_soup = soup.find_all("a", {"class":"focusIndicatorOutlineBlack", "href":True})
other_pages_soup

[<a aria-current="page" class="focusIndicatorOutlineBlack bbc-gtjcdn" href="?page=1">1</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=2">2</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=3">3</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=4">4</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=5">5</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=6">6</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=7">7</a>,
 <a class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=28">28</a>,
 <a aria-labelledby="pagination-next-page" class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=2"><span id="pagination-next-page"><span class="bbc-m04vo2">ရှေ့သို့သွားရန်</span><svg aria-hidden="true" focusable="false" height="12" viewBox="0 0 32 32" width="12"><path d="M21.6 14.3L5.5 31h6.4l14.6-15L11.9 1H5.5l16.1 16.7v-3.4z"></path></svg></span></a>]

In [3]:
# locate the pagination anchor that leads to the following page
next_page_soup = soup.find(
    "a",
    attrs={
        "aria-labelledby": "pagination-next-page",
        "class": "focusIndicatorOutlineBlack",
        "href": True,
    },
)
next_page_soup

<a aria-labelledby="pagination-next-page" class="focusIndicatorOutlineBlack bbc-1spja2a" href="?page=2"><span id="pagination-next-page"><span class="bbc-m04vo2">ရှေ့သို့သွားရန်</span><svg aria-hidden="true" focusable="false" height="12" viewBox="0 0 32 32" width="12"><path d="M21.6 14.3L5.5 31h6.4l14.6-15L11.9 1H5.5l16.1 16.7v-3.4z"></path></svg></span></a>

In [4]:
# extract the adder link: the relative address stored in the href attribute
# of the next-page anchor, to be appended to the base url
adder_url = next_page_soup.attrs['href']
adder_url

'?page=2'

In [5]:
# get the full address of the next page
# plain concatenation is enough here because `url` has no query string of its
# own and `adder_url` is only a query string such as '?page=2'
complete_url = url + adder_url
complete_url

'https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=2'

In [6]:
# get the last page index
# on the first page the final pagination link is the "next page" arrow, so the
# highest page number is the text of the second-to-last link
last_page_soup = other_pages_soup[-2]
last_page_index = int(last_page_soup.string)
last_page_index

28

In [7]:
# try to loop getting urls until the last page
# each pass reads the "next page" link from the current soup, downloads that
# page and makes it the current soup for the following pass
static_url = url # base address that every relative next-page link is appended to
for i in range(last_page_index):
    try:
        print(complete_url) # printed before it is updated, so the first pass shows the value left over from the earlier cell
        next_page_soup = soup.find("a", {"aria-labelledby":"pagination-next-page", "class":"focusIndicatorOutlineBlack", "href":True})
        adder_url = next_page_soup.attrs['href']
        complete_url = static_url + adder_url
        response = requests.get(complete_url)
        soup = BeautifulSoup(response.content, 'html5lib')
    except AttributeError: # when no next-page link is found, reading .attrs fails; the loop does not break, it just reports and moves on
        print("The end of the pages is reached.")

https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=2
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=2
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=3
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=4
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=5
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=6
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=7
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=8
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=9
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=10
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=11
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=12
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=13
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=14
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=15
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=16
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=17
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=18
h

## Making Everything a Function

In [8]:
# function to get soup with URL input
def web_scraper(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The downloaded content parsed with the ``html5lib`` parser.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html5lib')

In [9]:
# function to get specific elements within a soup
def soup_parser(soup):
    """Pull the headline anchors and the timestamp tags out of a parsed page.

    Returns a ``(headline_links, timestamps)`` tuple: every ``<a>`` carrying
    the ``focusIndicatorDisplayBlock`` class and every ``<time>`` carrying the
    ``promo-timestamp`` class, each exactly as ``find_all`` returns them.
    """
    wanted = (
        ("a", {"class": "focusIndicatorDisplayBlock"}),
        ("time", {"class": "promo-timestamp"}),
    )
    headline_links, timestamps = (soup.find_all(tag, attrs) for tag, attrs in wanted)
    return headline_links, timestamps

In [10]:
# function to create lists that are to be appended to original ones
def list_append(news_headers_soup, datetime_soup):
    """Extract the text of every headline/timestamp pair found on one page.

    Args:
        news_headers_soup: Headline ``<a>`` tags of the page.
        datetime_soup: ``<time>`` tags of the page, in the same order as the
            headlines.

    Returns:
        A ``(news_headers_per_page, datetime_per_page)`` tuple of lists holding
        the stripped text of each tag, one entry per input tag.

    Raises:
        ValueError: If the two inputs differ in length. Pairing them anyway
            would attach dates to the wrong headlines, and returning nothing
            would only surface later as an unpacking error in the caller.
        AttributeError: If a timestamp tag has no single text child.
    """
    if len(news_headers_soup) != len(datetime_soup):
        raise ValueError(
            "cannot pair {} headlines with {} timestamps".format(
                len(news_headers_soup), len(datetime_soup)
            )
        )

    news_headers_per_page = []
    datetime_per_page = []
    for header_tag, time_tag in zip(news_headers_soup, datetime_soup):
        header_text = header_tag.string
        if header_text is None:
            # headlines with a video tag have several children, so .string is
            # None; the title is then the second child of the inner <span>
            header_text = list(header_tag.span)[1]
        news_headers_per_page.append(header_text.strip())
        datetime_per_page.append(time_tag.string.strip())

    return news_headers_per_page, datetime_per_page

In [11]:
# function to get next page url
def navigate_next_page(web_url, soup):
    """Return the absolute address of the page that follows ``soup``.

    Args:
        web_url: Address of the topic listing the relative link is resolved
            against. It may already carry a query string such as ``?page=2``.
        soup: Parsed page whose "next page" pagination link is followed.

    Returns:
        The address of the next page as a string.

    Raises:
        AttributeError: If the page has no "next page" link, which is how
            callers detect the end of the pagination.
    """
    # local import so the function works without touching the import cell
    from urllib.parse import urljoin

    next_page_soup = soup.find("a", {"aria-labelledby":"pagination-next-page", "class":"focusIndicatorOutlineBlack", "href":True})
    # urljoin replaces an existing query instead of appending a second one,
    # which plain string concatenation would do
    return urljoin(web_url, next_page_soup.attrs['href'])

In [12]:
# function to get the last page index
def get_max_page(soup):
    """Return the highest page number shown in the pagination links.

    The largest numeric link label is used rather than a fixed position, so
    the result does not depend on whether a "next page" arrow follows the
    numbers.

    Args:
        soup: Parsed page containing the pagination links.

    Returns:
        The largest page number found, or 1 when the page shows no numbered
        pagination links.
    """
    page_numbers = []
    for link in soup.find_all("a", {"class":"focusIndicatorOutlineBlack", "href":True}):
        label = link.string  # None for links without a single text child
        if label is not None and label.strip().isdecimal():
            page_numbers.append(int(label))
    return max(page_numbers, default=1)

In [13]:
# function to produce a spreadsheet
def export_excel(first_list, second_list, output_path='BBC_webscraped.xlsx'):
    """Build the results table and save it as an Excel spreadsheet.

    Args:
        first_list: Values for the ``News Header`` column.
        second_list: Values for the ``Time`` column.
        output_path: Where to write the spreadsheet. Pass ``None`` to build
            the table without writing any file.

    Returns:
        The ``pandas.DataFrame`` holding both columns.
    """
    columns = {'News Header': first_list, 'Time': second_list}
    # wrapping each list in a Series lets columns of unequal length coexist;
    # the shorter one is padded with NaN instead of raising
    df = pd.DataFrame({key: pd.Series(value) for key, value in columns.items()})
    if output_path is not None:
        df.to_excel(output_path, index=False)
    return df

## Scraping Every Page to Spreadsheet

In [14]:
# the main function
def main(web_url):
    """Scrape every page of a topic listing and export the result.

    The first page is downloaded, then the "next page" link of each page is
    followed until a page has none. The address of each page is printed once
    it has been processed.

    Args:
        web_url: Address of the first page of the topic listing.

    Returns:
        Whatever ``export_excel`` returns for the collected headlines and
        timestamps.
    """
    news_headers = []
    timestamps = []  # not called ``datetime`` so the stdlib module name is not shadowed

    # for the first page
    complete_url = web_url  # initial url
    soup = web_scraper(complete_url)
    last_page_index = get_max_page(soup)
    page_headers, page_times = list_append(*soup_parser(soup))
    news_headers += page_headers
    timestamps += page_times

    # from the second page onwards; last_page_index only caps the number of
    # requests, the loop normally ends when no next-page link is left
    for _ in range(last_page_index):
        print(complete_url)  # address of the page fetched last
        try:
            complete_url = navigate_next_page(web_url, soup)
        except AttributeError:  # the current page has no next-page link
            print("The end of the pages is reached.")
            break

        soup = web_scraper(complete_url)
        try:
            page_headers, page_times = list_append(*soup_parser(soup))
        except AttributeError:
            # a tag on this page had no plain text; report it as a parse
            # problem (not as the end of the pages) and carry on
            print("Could not parse", complete_url, "- skipping this page.")
            continue
        news_headers += page_headers
        timestamps += page_times

    return export_excel(news_headers, timestamps)  # export the spreadsheet and return df

In [15]:
# run the full scrape on the topic listing; main() also writes BBC_webscraped.xlsx
df = main("https://www.bbc.com/burmese/topics/c9wpm0en9jdt")

https://www.bbc.com/burmese/topics/c9wpm0en9jdt
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=2
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=3
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=4
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=5
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=6
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=7
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=8
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=9
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=10
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=11
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=12
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=13
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=14
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=15
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=16
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=17
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=18
https://

In [16]:
# check the dataframe: one row per scraped headline with its 'Time' value
df

Unnamed: 0,News Header,Time
0,ဗီယက်နမ်ခေါင်းဆောင် ငုယင်ဖူချောင်း အသက် ၈ဝ မှာ...,၁၉ ဇူလိုင် ၂၀၂၄
1,‘ကျွန်တော်လက်ထဲ ဘာလက်နက်မှမရှိဘူး’- ပစ်သတ်ခံခဲ...,၁၈ ဇူလိုင် ၂၀၂၄
2,ဘန်ကောက် ဟိုတယ်ထဲ သေဆုံးသူ ၆ ဦးရဲ့သွေးထဲမှာ ဆိ...,၁၇ ဇူလိုင် ၂၀၂၄
3,ထိုင်းဗီဇာမလိုဘဲ ပြည်တွင်းဝင်ခွင့် နိုင်ငံ ၃၀ ...,၁၃ ဇူလိုင် ၂၀၂၄
4,ရိုးမရဲ့ ဘီလျံနာသူဌေးကြီး ဆာ့ချ်ပန်းကို စစ်ကော...,၁၁ ဇူလိုင် ၂၀၂၄
...,...,...
666,ကိုဗစ်ကြောင့် ထိုင်းမှာ ပိတ်ထားတာတွေကို ပြန်ဖွင့်,၈ မေ ၂၀၂၀
667,ရှမ်းပြည်က ပြောင်းဖူးတွေ မြစ်ထဲ ပစ်ချနေရတဲ့ နေ...,၈ မေ ၂၀၂၀
668,အိန္ဒိယမှာ ကိုရိုနာဗိုင်းရပ်စ် ကူးစက်မှု ထိုးတက်,၅ မေ ၂၀၂၀
669,ကိုဗစ်ကြောင့်တန်နဲ့ချီ ပျက်စီးခဲ့ရတဲ့ တရုတ်နယ်...,၂၈ ဧပြီ ၂၀၂၀
