### Imports

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
import re
import time

### If behind a proxy

If you are behind a proxy, insert the following code (replace the username, password and proxy host with your own):

    import sys,os,os.path

    # the value of each variable must be the proxy URL itself; the variable
    # name must not be repeated inside the value
    os.environ['HTTP_PROXY']="https://username:password@cache.itb.ac.id:8080"
    os.environ['HTTPS_PROXY']="https://username:password@cache.itb.ac.id:8080"

### Validation function

In [3]:
# to check whether url is a fauna/flora page or not
def isFloraFauna(url) :
    """Download ``url`` and report whether it looks like a flora/fauna page.

    A page is accepted only when it contains BOTH markers: a ``<td>`` whose
    text is exactly "Kingdom:" and a ``<span class="species">``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        ``(True, page)`` when both markers are present, ``(False, page)`` when
        the page was downloaded but does not match, and ``(False, None)`` when
        downloading or parsing failed. ``page`` is the raw response body.
    """
    try :
        # the context manager closes the HTTP response even if read() fails
        with urlopen(url) as response :
            page = response.read()
        soup = BeautifulSoup(page, 'html.parser')

        # every flora and fauna has Kingdom
        hasKingdom = soup.find("td", text = "Kingdom:") is not None

        # every flora and fauna has Species
        hasSpecies = soup.find("span", {"class" : "species"}) is not None
    except Exception :
        # best effort: any download/parse failure simply means "not a match";
        # unlike a bare except, KeyboardInterrupt still stops a long crawl
        return (False, None)

    # both markers are required (previously the Kingdom result was overwritten
    # by the species lookup and therefore never taken into account)
    return (hasKingdom and hasSpecies, page)

In [4]:
# to check whether the page is a national park page or not
def isNationalPark(url) :
    """Download ``url`` and report whether it looks like a national park page.

    The check is done on the ``<title>`` tag: its text must contain the
    substring "National Park - Wikipedia".

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        ``(True, page)`` when the title matches, ``(False, page)`` when the
        page was downloaded but does not match, and ``(False, None)`` when
        downloading or parsing failed. ``page`` is the raw response body.
    """
    try :
        # the context manager closes the HTTP response even if read() fails
        with urlopen(url) as response :
            page = response.read()
        soup = BeautifulSoup(page, 'html.parser')

        # national park pages carry "National Park - Wikipedia" in their title
        found = soup.find("title", text = re.compile("^.*National Park - Wikipedia.*$"))
    except Exception :
        # best effort: any download/parse failure simply means "not a match";
        # unlike a bare except, KeyboardInterrupt still stops a long crawl
        return (False, None)

    return (found is not None, page)

### Scraping functions and their test cases

In [5]:
# helper: scientific name stored as <span class=cssClass><i>...</i></span>
def _italicTextOf(soup, cssClass) :
    """Return the text of the first ``<i>`` inside the first
    ``<span class=cssClass>``, or ``None`` when either tag is missing."""
    span = soup.find("span", {"class" : cssClass})
    italic = span.find("i") if span is not None else None
    return italic.text if italic is not None else None


# for scraping flora and fauna
def scrapeFloraFauna(url, page) :
    """Extract the basic facts of a flora/fauna page into a dictionary.

    Parameters
    ----------
    url : str
        Address the page was downloaded from; stored unchanged under "url".
    page : bytes or str
        HTML of the page, as accepted by ``BeautifulSoup``.

    Returns
    -------
    dict
        Always contains "name" (page title without " - Wikipedia"),
        "kingdom" and "url". Contains "image", "binomial-name",
        "trinomial-name" and "status" only when the corresponding markup is
        present on the page.

    Raises
    ------
    AttributeError
        When the page has no ``<title>`` or no "Kingdom:" row with a link in
        the following cell (the kingdom is mandatory because callers use it
        to tell flora from fauna).
    """
    result = {}

    soup = BeautifulSoup(page, 'html.parser')

    # get the name of flora/fauna
    title = soup.find("title").text
    result["name"] = title.replace(" - Wikipedia", "")

    # get the image of flora/fauna; not every page has one, so a missing
    # table / link / <img> just leaves the key out instead of raising
    infobox = soup.find("table", {"class" : re.compile("^.*infobox.*$")})
    imageLink = infobox.find("a", {"class" : "image"}) if infobox is not None else None
    imageTag = imageLink.find("img") if imageLink is not None else None
    imageSrc = imageTag.get("src") if imageTag is not None else None
    if imageSrc is not None :
        result["image"] = imageSrc

    # get the kingdom of flora/fauna (to distinguish flora/fauna)
    kingdomLabel = soup.find("td", text = "Kingdom:")
    result["kingdom"] = kingdomLabel.find_next_sibling("td").find("a").text

    # some flora/fauna has binomial name and other has trinomial name
    binomial = _italicTextOf(soup, "binomial")
    if binomial is not None :
        result["binomial-name"] = binomial

    trinomial = _italicTextOf(soup, "trinomial")
    if trinomial is not None :
        result["trinomial-name"] = trinomial

    # some flora/fauna has the status of their existence which is grouped by IUCN Red List
    try :
        status = soup.find("a", {"title" : "IUCN Red List"}).find_parent("small").find_previous_sibling("a").text
        result["status"] = status
    except AttributeError :
        # one of the lookups in the chain returned None: the page has no status
        pass

    # note the url
    result["url"] = url

    return result

In [6]:
# test case url
# (a plant page to try instead: 'https://en.wikipedia.org/wiki/Terminalia_catappa')
url = 'https://en.wikipedia.org/wiki/Dhole'

# test case: validate the page first, then scrape it and pretty-print the result
isKingdom, page = isFloraFauna(url)
if isKingdom :
    result = scrapeFloraFauna(url, page)
    print (json.dumps(result, indent=4, sort_keys=True))

{
    "binomial-name": "Cuon alpinus",
    "image": "//upload.wikimedia.org/wikipedia/commons/thumb/1/13/Indian_wild_dog_by_N._A._Naseer.jpg/220px-Indian_wild_dog_by_N._A._Naseer.jpg",
    "kingdom": "Animalia",
    "name": "Dhole",
    "status": "Endangered",
    "url": "https://en.wikipedia.org/wiki/Dhole"
}


In [8]:
# helper: longitude/latitude of the "Coordinates" infobox row
def _parkCoordinates(cell) :
    """Return ``{'longitude': ..., 'latitude': ...}`` read from the infobox
    cell, or ``None`` when any of the expected tags is missing."""
    anchor = cell.find("a", text = "Coordinates")
    holder = anchor.find_next_sibling("span") if anchor is not None else None
    if holder is None :
        return None
    longitude = holder.find("span", {"class" : "longitude"})
    latitude = holder.find("span", {"class" : "latitude"})
    if longitude is None or latitude is None :
        return None
    return {'longitude' : longitude.text, 'latitude' : latitude.text}


# helper: copy every "<th>label</th><td>value</td>" infobox row into result
def _collectParkInfobox(infoboxTable, result) :
    """Store each infobox row in ``result`` under a key built from its label
    (non-breaking spaces normalised, lower-cased, spaces turned into dashes)."""
    for header in infoboxTable.find_all("th") :
        cell = header.find_next_sibling("td")
        if cell is None :
            # header without a value cell on the same level: nothing to store
            continue

        # only the non-breaking space is normalised; other non-ASCII
        # characters and literal question marks are kept as they are
        key = header.text.replace("\xa0", " ").lower().replace(" ", "-")

        # coordinates has a special case, because it contains longitude and latitude
        if key == "coordinates" :
            coordinates = _parkCoordinates(cell)
            if coordinates is not None :
                result[key] = coordinates
            continue

        value = cell.text.replace("\xa0", " ")
        # remove foot note markers such as "[1]"; the pattern is non-greedy by
        # construction so text standing between two markers is preserved
        result[key] = re.sub(r"\[[^\]\n]*\]", "", value)


# helper: scrape every flora/fauna page linked from the "Flora and fauna" section
def _collectParkSpecies(soup, result) :
    """Fill ``result['flora']`` / ``result['fauna']`` from the links found
    between the "Flora_and_fauna" ``<h2>`` and the next ``<h2>``.

    Nothing is added when the page has no such section."""
    ffSpan = soup.find("span", {"id" : "Flora_and_fauna"})
    ffHeading = ffSpan.find_parent("h2") if ffSpan is not None else None
    if ffHeading is None :
        # some pages do not give flora/fauna information
        return

    # end of the section; None when the section is the last one on the page
    nextHeading = ffHeading.find_next_sibling("h2")

    result['flora'] = []
    result['fauna'] = []
    i = -1

    # True matches tags only, so text nodes between the tags are skipped
    sibling = ffHeading.find_next_sibling(True)
    while sibling is not None and sibling is not nextHeading :
        for a in sibling.find_all("a") :
            href = a.get("href", "")
            # only internal article links can be flora/fauna pages; anchors,
            # external links and links without href are not worth a request
            if not href.startswith("/wiki/") :
                continue

            # add sleep so the crawler does not get banned
            time.sleep(1)
            ffUrl = "https://en.wikipedia.org" + href

            # check if ffUrl is a flora/fauna page
            isKingdom, ffPage = isFloraFauna(ffUrl)
            if not isKingdom :
                continue

            print (i)
            i = i - 1
            try :
                jsonTemp = scrapeFloraFauna(ffUrl, ffPage)
            except Exception :
                # best effort: one page that cannot be scraped must not
                # discard the species that are still to come
                continue

            # group the flora/fauna based on their kingdom
            if jsonTemp["kingdom"] == 'Plantae' :
                result['flora'].append(jsonTemp)
            else :
                result['fauna'].append(jsonTemp)

        sibling = sibling.find_next_sibling(True)


# for scraping national park
def scrapeNationalPark(url, page) :
    """Extract the facts of a national park page into a dictionary.

    Parameters
    ----------
    url : str
        Address the page was downloaded from (currently not used by the body).
    page : bytes or str
        HTML of the page, as accepted by ``BeautifulSoup``.

    Returns
    -------
    dict
        "name" (page title without " - Wikipedia") is always present.
        "image", one key per infobox row, and "coordinates" (a dict with
        "longitude" and "latitude") are present when the markup exists.
        "flora" and "fauna" (lists of ``scrapeFloraFauna`` results) are
        present when the page has a "Flora_and_fauna" section.

    Raises
    ------
    AttributeError
        When the page has no ``<title>`` tag.

    Notes
    -----
    Every candidate species link triggers a download preceded by a one
    second pause, so this call can take a long time.
    """
    result = {}
    soup = BeautifulSoup(page, 'html.parser')

    # get the name of national park
    title = soup.find("title").text
    result["name"] = title.replace(" - Wikipedia", "")

    # the infobox is the box which contains the image, location and so on
    infoboxTable = soup.find("table", {"class" : re.compile("^.*infobox.*$")})
    if infoboxTable is not None :
        # get the image of national park (left out when there is none)
        imageLink = infoboxTable.find("a", {"class" : "image"})
        imageTag = imageLink.find("img") if imageLink is not None else None
        imageSrc = imageTag.get("src") if imageTag is not None else None
        if imageSrc is not None :
            result["image"] = imageSrc

        _collectParkInfobox(infoboxTable, result)

    # find all special flora and fauna in national park
    _collectParkSpecies(soup, result)

    return result

In [None]:
# test case url
url = 'https://en.wikipedia.org/wiki/Baluran_National_Park'

# test case: download the page and check that it is a national park page
isPark, page = isNationalPark(url)

# scrape it and pretty-print the result as JSON with sorted keys
# NOTE: scrapeNationalPark also downloads the pages linked from the park's
# flora/fauna section, sleeping one second before each request, so this cell
# can run for a while
if isPark :
    result = scrapeNationalPark(url, page)
    print (json.dumps(result, indent=4, sort_keys=True))

{
    "area": "250 km2 (97 sq mi)",
    "coordinates": {
        "latitude": "7\u00b050\u2032S",
        "longitude": "114\u00b022\u2032E"
    },
    "established": "1980",
    "fauna": [
        {
            "binomial-name": "Bos javanicus",
            "image": "//upload.wikimedia.org/wikipedia/commons/thumb/0/02/Bos_javanicus_javanicus.jpg/220px-Bos_javanicus_javanicus.jpg",
            "kingdom": "Animalia",
            "name": "Banteng",
            "status": "Endangered",
            "url": "https://en.wikipedia.org/wiki/Banteng"
        },
        {
            "image": "//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Sumatran_dhole.jpg/220px-Sumatran_dhole.jpg",
            "kingdom": "Animalia",
            "name": "Sumatran dhole",
            "status": "Endangered",
            "trinomial-name": "Cuon alpinus sumatrensis",
            "url": "https://en.wikipedia.org/wiki/Sumatran_dhole"
        },
        {
            "binomial-name": "Muntiacus muntjak",
            

In [None]:
# big picture scraper
def bigPicture(url) :
    """Scrape every national park linked from a list page.

    Parameters
    ----------
    url : str
        Address of the list page. Its links are relative to
        "https://en.wikipedia.org".

    Returns
    -------
    dict
        ``{'national-park': [...]}`` holding one ``scrapeNationalPark`` result
        per distinct linked page that ``isNationalPark`` accepts.

    Notes
    -----
    Errors while downloading the list page itself are not caught here and
    propagate to the caller. A counter is printed for every visited link to
    show that the crawl is still running.
    """
    result = {'national-park' : []}

    # the context manager closes the HTTP response even if read() fails
    with urlopen(url) as response :
        page = response.read()
    soup = BeautifulSoup(page, 'html.parser')

    # find all link that has "National Park" in their title
    listPark = soup.find_all("a", {"title" : re.compile("^.*National Park.*$")})

    # the same park can be linked several times on the page; remember what was
    # already visited so every park is downloaded and stored only once
    visited = set()

    # progress counter, because a crawl gives no other sign of life
    i = 1
    for park in listPark :
        href = park.get('href')
        if href is None or href in visited :
            continue
        visited.add(href)

        # pause between requests so the crawler does not get banned
        time.sleep(1)

        print(i)
        i = i + 1

        # get the real url (kept in its own name so the url parameter is not
        # overwritten inside the loop)
        parkUrl = "https://en.wikipedia.org" + href

        # verify whether the page contains national park information
        isPark, parkPage = isNationalPark(parkUrl)
        if isPark :
            result['national-park'].append(scrapeNationalPark(parkUrl, parkPage))

    return result

### Main Program

In [None]:
# entry point: crawl the list of Indonesian national parks and store
# everything that was scraped in data.json
url = "https://en.wikipedia.org/wiki/List_of_national_parks_of_Indonesia"
data = bigPicture(url)

# serialise first, then write the finished text in one go
with open('data.json', 'w') as outfile:
    outfile.write(json.dumps(data))