# BBC Burmese Webscraper (4): Scraping Headers, Dates, and Contents of Multiple Pages
by <a href="https://www.linkedin.com/in/la-wun-nannda-b047681b5/">`La Wun Nannda`</a>

## Libraries

In [1]:
# import libraries
from bs4 import BeautifulSoup # this module helps in web scraping
import requests  # this module helps us to download a web page
import pandas as pd

## Functions for One Page

In [2]:
# function to get soup with URL input
def web_scraper(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup object built from the response body with the
        'html5lib' parser.

    Raises:
        Whatever `requests.get` raises on network failure (presumably a
        `requests.exceptions.RequestException` subclass such as a timeout).
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status code is not checked: whatever body came back is parsed.
    soup = BeautifulSoup(response.content, 'html5lib')
    return soup

In [3]:
# function to get specific elements within a soup
def soup_parser(soup):
    """Pick the headline links and the timestamp tags out of a parsed page.

    Args:
        soup: Parsed listing page (any object offering `find_all`).

    Returns:
        A pair `(headline_links, timestamp_tags)`: every <a> tag with class
        "focusIndicatorDisplayBlock" and every <time> tag with class
        "promo-timestamp", each as returned by `soup.find_all`.
    """
    headline_filter = {"class": "focusIndicatorDisplayBlock"}
    timestamp_filter = {"class": "promo-timestamp"}

    headline_links = soup.find_all("a", headline_filter)
    timestamp_tags = soup.find_all("time", timestamp_filter)
    return headline_links, timestamp_tags

In [4]:
# function to extract Burmese content from a content url
def content_scraper(content_url, soup):
    """Collect the non-Latin (Burmese) text of an article from its parsed page.

    Every <p> element whose `.string` is available is stripped of surrounding
    whitespace; ASCII letters (either case) and the symbols ' ( ) are then
    dropped and everything else is kept, paragraph after paragraph, with no
    separator added between paragraphs.

    Args:
        content_url: URL of the article. Not used by the extraction itself;
            kept so existing callers do not have to change.
        soup: Parsed article page (any object offering `find_all("p")`).

    Returns:
        The filtered text as a single string ("" when nothing qualifies).
    """
    latin_letters = frozenset("abcdefghijklmnopqrstuvwxyz")
    symbols = frozenset("'()")

    kept_chars = []
    for p_element in soup.find_all("p"):
        text = p_element.string
        if text is None:
            # No single text node to read for this paragraph; skip it
            # explicitly instead of hiding the failure behind a bare except,
            # which would also swallow KeyboardInterrupt.
            continue
        for char in text.strip():
            # do not add non-Burmese characters or symbols
            if char.lower() in latin_letters or char in symbols:
                continue
            kept_chars.append(char)

    # Join once at the end instead of growing a string character by character.
    return "".join(kept_chars)

In [5]:
# function to create lists for a page
def list_append_per_page(news_headers_soup, datetime_soup):
    """Build the header, date and content lists for one listing page.

    The returned lists are meant to be appended to the caller's running lists.

    Args:
        news_headers_soup: Sequence of headline <a> tags; each must carry an
            'href' attribute pointing at the article.
        datetime_soup: Sequence of <time> tags, expected to pair up
            one-to-one, by position, with the headers.

    Returns:
        A tuple `(news_headers_per_page, datetime_per_page, contents_per_page)`.
        All three lists are empty when the page has no items or when the
        number of headers and dates differ, so callers can always unpack the
        result.

    Raises:
        AttributeError: If a <time> tag has no readable `.string`.
    """
    news_headers_per_page = []
    datetime_per_page = []
    contents_per_page = []

    # each header should have a corresponding date; if the counts differ,
    # pairing them by position would attach dates to the wrong headers
    if len(news_headers_soup) != len(datetime_soup):
        print("Skipping page:", len(news_headers_soup), "headers but",
              len(datetime_soup), "dates were found.")
        return news_headers_per_page, datetime_per_page, contents_per_page

    for header_tag, time_tag in zip(news_headers_soup, datetime_soup):
        # list 1 for multiple headers in a page
        try:  # news headers without video tag
            header_text = header_tag.string.strip()
        except AttributeError:  # news headers with video tag
            # list() turns the <span> tag's children into a list so the
            # second child (the title text) can be picked by index
            header_text = list(header_tag.span)[1].strip()
        news_headers_per_page.append(header_text)

        # list 2 for date and time in a page
        datetime_per_page.append(time_tag.string.strip())

        # list 3 for contents of all headers in a page
        content_url = header_tag.attrs['href']  # link to the article
        content_soup = web_scraper(content_url)  # fetch and parse the article
        contents_per_page.append(content_scraper(content_url, content_soup))

    return news_headers_per_page, datetime_per_page, contents_per_page

## Functions for Multiple Pages

In [6]:
# function to get next page url
def navigate_next_page(web_url, soup):
    """Return the URL of the page that follows the one held in `soup`.

    Args:
        web_url: URL the pagination link's href is resolved against.
        soup: Parsed listing page (any object offering `find`).

    Returns:
        The next page's URL as a string.

    Raises:
        AttributeError: If the page has no "next page" link, i.e. `soup.find`
            returned None. `main` catches this to detect the end of the pages,
            so no None check is added here.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    next_page_soup = soup.find("a", {"aria-labelledby":"pagination-next-page", "class":"focusIndicatorOutlineBlack", "href":True})
    href = next_page_soup.attrs['href']
    # urljoin gives the same result as concatenation for "?page=N" hrefs, and
    # also handles absolute hrefs and a base URL that already has a query.
    complete_url = urljoin(web_url, href)
    return complete_url

In [7]:
# function to get the last page index
def get_max_page(soup):
    """Read the index of the last page from a listing page's links.

    All <a> tags with class "focusIndicatorOutlineBlack" and an href are
    collected; the text of the second-to-last one is converted to an int.

    Args:
        soup: Parsed listing page (any object offering `find_all`).

    Returns:
        The last page index as an int.
    """
    pagination_links = soup.find_all("a", {"class":"focusIndicatorOutlineBlack", "href":True})
    # presumably the final link is the "next page" arrow, which would make the
    # one before it the highest page number; verify against the page markup
    highest_page_link = pagination_links[-2]
    return int(highest_page_link.string)

## Function for Excel

In [8]:
# function to produce a spreadsheet
def export_excel(first_list, second_list, third_list, filename='BBC_webscraped.xlsx'):
    """Combine three lists into a DataFrame and save it as a spreadsheet.

    Args:
        first_list: Values for the 'News Header' column.
        second_list: Values for the 'Time' column.
        third_list: Values for the 'Content' column.
        filename: Path of the Excel file to write. Defaults to
            'BBC_webscraped.xlsx', the name that used to be hard-coded.

    Returns:
        The DataFrame that was written to `filename`.
    """
    columns = {
        'News Header': first_list,
        'Time': second_list,
        'Content': third_list,
    }
    # Each list is wrapped in a Series so that lists of different lengths can
    # still be combined into one frame instead of raising.
    df = pd.DataFrame({key: pd.Series(value) for key, value in columns.items()})
    df.to_excel(filename, index=False)
    return df

## Scraping Every Page to Spreadsheet

In [9]:
# the main function
def main(web_url):
    """Scrape every listing page of a topic and export the collected data.

    The first page is downloaded to learn the last page index, then the
    "next page" link is followed page by page. Headers, dates and article
    contents of all pages are gathered and handed to `export_excel`.

    Args:
        web_url: URL of the topic's first page; it is also passed to
            `navigate_next_page` as the base for the following pages.

    Returns:
        The DataFrame returned by `export_excel`.
    """
    news_headers = []
    timestamps = []  # not called `datetime`, to avoid shadowing the stdlib module name
    contents = []

    def add_page(page_soup):
        """Extract one parsed page and append its data to the running lists."""
        news_headers_soup, datetime_soup = soup_parser(page_soup)
        page_data = list_append_per_page(news_headers_soup, datetime_soup)
        if page_data is None:
            # nothing was returned for this page; there is nothing to
            # unpack, so leave the running lists untouched
            return
        news_headers_per_page, datetime_per_page, contents_per_page = page_data
        news_headers.extend(news_headers_per_page)
        timestamps.extend(datetime_per_page)
        contents.extend(contents_per_page)

    # for the first page
    complete_url = web_url  # initial url
    soup = web_scraper(complete_url)  # get soup
    last_page_index = get_max_page(soup)  # get last page index
    print("Total pages:", last_page_index)
    add_page(soup)

    # from the second page to the last page; the range is one longer than the
    # number of remaining pages, so the final pass is the one that finds no
    # "next page" link and ends the loop
    for _ in range(last_page_index):
        print(complete_url)
        try:
            complete_url = navigate_next_page(web_url, soup)  # get next url
        except AttributeError:
            # navigate_next_page raises AttributeError when there is no
            # "next page" link, i.e. the last page has been processed
            print("The end of the pages is reached.")
            break

        soup = web_scraper(complete_url)  # get soup
        try:
            add_page(soup)
        except AttributeError:
            # a tag on this page lacked the expected text; skip the page but
            # keep going, and do not report it as the end of the pages
            print("Skipped a page that could not be parsed:", complete_url)

    return export_excel(news_headers, timestamps, contents)  # export the spreadsheet # return df

In [10]:
# Run the full scrape for this BBC Burmese topic. This downloads the listing
# pages and every linked article, prints progress, and (through export_excel)
# writes 'BBC_webscraped.xlsx' before returning the DataFrame.
df = main("https://www.bbc.com/burmese/topics/c9wpm0en9jdt")

Total pages: 28
https://www.bbc.com/burmese/topics/c9wpm0en9jdt
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=2
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=3
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=4
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=5
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=6
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=7
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=8
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=9
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=10
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=11
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=12
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=13
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=14
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=15
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=16
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?page=17
https://www.bbc.com/burmese/topics/c9wpm0en9jdt?

In [11]:
# check the dataframe: display the scraped result returned by main()
# (columns: News Header, Time, Content)
df

Unnamed: 0,News Header,Time,Content
0,ဗီယက်နမ်ခေါင်းဆောင် ငုယင်ဖူချောင်း အသက် ၈ဝ မှာ...,၁၉ ဇူလိုင် ၂၀၂၄,ဗီယက်နမ်ရဲ့ သက်တမ်းရှည် ကွန်မြူနစ်ပါတီခေါင်းဆေ...
1,‘ကျွန်တော်လက်ထဲ ဘာလက်နက်မှမရှိဘူး’- ပစ်သတ်ခံခဲ...,၁၈ ဇူလိုင် ၂၀၂၄,အဘူ ဆာယိဒ်ဟာ ဘင်္ဂလားဒေ့ရှ်နိုင်ငံက ကျောင်းသား...
2,ဘန်ကောက် ဟိုတယ်ထဲ သေဆုံးသူ ၆ ဦးရဲ့သွေးထဲမှာ ဆိ...,၁၇ ဇူလိုင် ၂၀၂၄,ထိုင်းနိုင်ငံ၊ ဘန်ကောက်မြို့က နာမည်ကျော် ဟိုတယ...
3,ထိုင်းဗီဇာမလိုဘဲ ပြည်တွင်းဝင်ခွင့် နိုင်ငံ ၃၀ ...,၁၃ ဇူလိုင် ၂၀၂၄,ထိုင်းမှာ တနင်္လာနေ့ကစပြီး ဗီဇာမလိုဘဲ ပြည်တွင်...
4,ရိုးမရဲ့ ဘီလျံနာသူဌေးကြီး ဆာ့ချ်ပန်းကို စစ်ကော...,၁၁ ဇူလိုင် ၂၀၂၄,ရိုးမဘဏ်နဲ့ ရိုးမအုပ်စုတည်ထောင်သူ ဦးသိမ်းဝေ ခ ...
...,...,...,...
666,ကိုဗစ်ကြောင့် ထိုင်းမှာ ပိတ်ထားတာတွေကို ပြန်ဖွင့်,၈ မေ ၂၀၂၀,ထိုင်းနိုင်ငံမှာ ကိုဗစ်၁၉ ကူးစက်မှုတွေများလာတဲ...
667,ရှမ်းပြည်က ပြောင်းဖူးတွေ မြစ်ထဲ ပစ်ချနေရတဲ့ နေ...,၈ မေ ၂၀၂၀,မြန်မာ-တရုတ်နယ်စပ်မှာ ပြောင်းဖူးသွားရောင်းတဲ့တ...
668,အိန္ဒိယမှာ ကိုရိုနာဗိုင်းရပ်စ် ကူးစက်မှု ထိုးတက်,၅ မေ ၂၀၂၀,အိန္ဒိယမှာ တစ်နေ့တည်း ကူးစက်မှု အများဆုံး အဖြစ...
669,ကိုဗစ်ကြောင့်တန်နဲ့ချီ ပျက်စီးခဲ့ရတဲ့ တရုတ်နယ်...,၂၈ ဧပြီ ၂၀၂၀,မြန်မာနိုင်ငံ ကနေ တရုတ်နိုင်ငံကို နယ်စပ်ကနေ တင...
