# CASE STUDY 1 - GROUP 2
### Members
- Bautista, Millette
- Dano, Jomari
- Narzoles, John Peter
- Roranes, Raven Rain
- Siaotong, Danica

---

### SETUP

In [3]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
from bs4 import BeautifulSoup
import re


base_url = "https://philatlas.com/"

---

### FUNCTIONS

In [4]:
def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


# url = webpage
# element = target element
# element_class = class of target element
def get_specific_element(url, element, element_class="", _id=""):
    # page = requests.get(url)
    page = requests_retry_session().get(url)
    if page.status_code != 200:
        return "Error on accessing Webpage"
    soup = BeautifulSoup(page.content, 'html.parser')

    if element_class == "":
        return soup.find(element, id=_id)

    if _id == "":
        return soup.find(element, class_=element_class)

    return soup.find(element, class_=element_class, id=_id)

---

### Output 1: Island Group, Region

In [5]:
# VARIABLES

# OUTPUT 1
island_groups = []
regions = []
url = f"{base_url}island-groups.html"

# OUTPUT 2
REGIONS = {}

In [6]:
right_table = get_specific_element(url, 'table', 'generic-table')

for row in right_table.find_all('tr'):
    #kinuha island group kasi di sya kasama sa td

    island = row.find_all('th', scope = 'row')
    if len(island) > 0:
    #     #finind ko yung laman ul para ilabas lahat ng mga list na nandun.
        _region = row.find('ul').text
        island_groups.append(island[0].find(text=True))
        regions.append(_region+' ')


        # SETUP FOR OUTPUT 2
        links = row.find('ul')
        for _url in links.find_all('a'):
            _region_url = base_url + _url.get('href')
            _txt = _url.get_text()
            _region_name = _txt.split(' – ')[0].split(' ')[0]
            REGIONS[_region_name] = _region_url

In [7]:
df_output1 = pd.DataFrame(island_groups, columns=['Island group'])
df_output1['Component regions'] = regions

In [32]:
df_output1.to_csv('../Output/Output1.csv')

---

### Output 2: Region, Province Name, Type, Population (2015), Population (2010), Annual Population Growth Rat (2010 - 2015), Area (2007 in km2), Density (2015 per km2), City Count, Mun Count, Brgy Count

In [8]:
# VARIABLES
# REGIONS
output2 = pd.DataFrame()

# FOR OUTPUT 3
output3 = pd.DataFrame()
PROVINCES = {}

# FOR OUTPUT 4
MUNICIPALITIES = {}



In [9]:
for name, link in REGIONS.items():
    right_table = get_specific_element(link, 'table', 'sortable datatable')

    # VARIABLE INITIALIZATION
    Region = []
    Province = []
    Type = []
    Population_2020 = []
    Population_2015 = []
    APGR_2015_2020 = []
    Area_2013 = []
    Density_2020 = []
    City_count = []
    Mun_count = []
    Brgy_count = []



    # VARIABLE INITIALIZATION for df_output3
    df_output3 = None
    o3_Province = []
    o3_Municipality = []
    o3_Type = []
    o3_Population_2020 = []
    o3_Population_2015 = []
    o3_APGR_2015_2020 = []
    o3_Area_2013 = []
    o3_Density_2020 = []
    o3_Brgy_count = []


    for row in right_table.find_all("tr"):
        cells = row.find_all('td')
        #kunin lahat ng "a" para makuha yung mga region name
        province = row.find_all('a')




        if len(province) > 0:
            if len(cells) > 0:
                Region.append(name)
                Province.append(province[0].get_text())
                Type.append(cells[0].get_text())
                Population_2020.append(cells[1].get_text())
                Population_2015.append(cells[2].get_text())
                APGR_2015_2020.append(cells[3].get_text())
                Area_2013.append(cells[4].get_text())
                Density_2020.append(cells[5].get_text())

                # SETUP FOR OUTPUT 3
                if (cells[0].get_text()).lower() == 'province':
                    for link in province:
                        _province_url = base_url + link.get('href')
                        # print(link.text)
                        PROVINCES[link.text] = _province_url
                else:
                    # SETUP FOR OUTPUT 4
                    for link in province:
                        _province_url = base_url + link.get('href')
                        # print(link.text)
                        MUNICIPALITIES[(province[0].get_text())+'|'+link.text] = _province_url

                    o3_Province.append(name)
                    o3_Municipality.append(province[0].get_text())
                    o3_Type.append(cells[0].get_text())
                    o3_Population_2020.append(cells[1].get_text())
                    o3_Population_2015.append(cells[2].get_text())
                    o3_APGR_2015_2020.append(cells[3].get_text())
                    o3_Area_2013.append(cells[4].get_text())
                    o3_Density_2020.append(cells[5].get_text())
                    o3_Brgy_count.append(cells[6].get_text())


                # if name != 'NCR':
                if len(cells) > 7:
                    City_count.append(cells[6].get_text())
                    Mun_count.append(cells[7].get_text())
                    Brgy_count.append(cells[8].get_text())


                else:
                    City_count.append('-')
                    Mun_count.append('-')
                    Brgy_count.append(cells[6].get_text())




    df_output2 = pd.DataFrame()
    df_output2['Region'] = Region
    df_output2['Province Name'] = Province
    df_output2['Type'] = Type
    df_output2['Population (2020)'] = Population_2020
    df_output2['Population (2015)'] = Population_2015
    df_output2['Annual Population Growth Rate (2015‑2020)'] = APGR_2015_2020
    df_output2['Area (2013), in km2'] = Area_2013
    df_output2['Density (2020), per km2'] = Density_2020

    if len(City_count) > 0:
        df_output2['City count'] = City_count

    if len(Mun_count) > 0:
        df_output2['Mun count'] = Mun_count


    df_output3 = pd.DataFrame()
    df_output3['Province'] = o3_Province
    df_output3['Municipality Name'] = o3_Municipality
    df_output3['Type'] = o3_Type
    df_output3['Population (2020)'] = o3_Population_2020
    df_output3['Population (2015)'] = o3_Population_2015
    df_output3['Annual Population Growth Rate (2015‑2020)'] = o3_APGR_2015_2020
    df_output3['Area (2013), in km2'] = o3_Area_2013
    df_output3['Density (2020), per km2'] = o3_Density_2020

    df_output3['Brgy count'] = o3_Brgy_count

    output3 = output3.append(df_output3)

    

    df_output2['Brgy count'] = Brgy_count
    output2 = output2.append(df_output2)

In [35]:
# PROVINCES
# len(PROVINCES)
# len(MUNICIPALITIES)
# MUNICIPALITIES
# pd.DataFrame.from_dict(enumerate(MUNICIPALITIES))

In [36]:
# print(output2)
output2.to_csv(f'../Output/Output2.csv')

In [37]:
# output2.describe()

---

### Output 3: Province, Municipality name, Type, Population (2015), Population (2010), Annual Population Growth Rate (2010 - 2015), Area (2007 in km2), Density (2015 per km2), Brgy Count

In [38]:
# VARIABLES


In [10]:
for i, (name,link) in enumerate(PROVINCES.items()):

    right_table = get_specific_element(link, 'table', 'sortable datatable')

    # VARIABLE INITIALIZATION
    df_output3 = None
    Province = []
    Municipality = []
    Type = []
    Population_2020 = []
    Population_2015 = []
    APGR_2015_2020 = []
    Area_2013 = []
    Density_2020 = []
    Brgy_count = []

    # CITY / Municipality
    Population_percentage_2020 = []
    Change_2015_2020 = []

    for row in right_table.find_all("tr"):
        cells = row.find_all('td')
        municipality = row.find_all('a')


        if len(municipality) > 0:

            # SETUP FOR OUTPUT 4
            for _link in municipality:
                _municipality_url = base_url + _link.get('href')
                MUNICIPALITIES[name+'|'+_link.text] = _municipality_url

            if len(cells) > 0:

                # PROVINCE
                Province.append(name)
                Municipality.append(municipality[0].get_text())
                Type.append(cells[0].get_text())
                Population_2020.append(cells[1].get_text())
                Population_2015.append(cells[2].get_text())
                APGR_2015_2020.append(cells[3].get_text())
                Area_2013.append(cells[4].get_text())
                Density_2020.append(cells[5].get_text())
                Brgy_count.append(cells[6].get_text())



    df_output3 = pd.DataFrame()
    df_output3['Province'] = Province
    df_output3['Municipality Name'] = Municipality
    df_output3['Type'] = Type
    df_output3['Population (2020)'] = Population_2020
    df_output3['Population (2015)'] = Population_2015
    df_output3['Annual Population Growth Rate (2015‑2020)'] = APGR_2015_2020
    df_output3['Area (2013), in km2'] = Area_2013
    df_output3['Density (2020), per km2'] = Density_2020

    df_output3['Brgy count'] = Brgy_count

    output3 = output3.append(df_output3)

In [40]:
# MUNICIPALITIES
# len(MUNICIPALITIES)
# pd.DataFrame.from_dict(enumerate(MUNICIPALITIES))

In [41]:

# output3.describe()
# output3.nunique()

In [42]:

    # PROVINCES
    # REGIONS

    # print(output3)
    output3.to_csv(f'../Output/Output3.csv')
    # output3.describe()


    # for index,(name,link) in enumerate(PROVINCES.items()):
    #     print(PROVINCES_TYPE[index])
    #     print(name)

---

### Output 4: Municipality Name, Barangay Name, Population Percentage (2015), Population (2015), Population (2010), Change (2010 - 2015), Annual Population Growth Rate (2010 - 2015), Postal Code, Coastal/Landlocked, Marine Waterbodies, Coordinates, Estimated Elevation above sea level

In [146]:
# output4 = pd.DataFrame()

# len(MUNICIPALITIES)
# MUNICIPALITIES

# N = 9
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[5: N])

#! BUG: Multiple Doble yung rows
#SKIPPED 24 = Cebu City
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[24:25])
# SKIPPED 29 = Zamboanga City
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[29:31])
# SKIPPED 35 = Butuan
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[35:36])
# ========= DONE ADDING

#! BUG: Table recognized as NONE
# SKIPPED 36 = Adams
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[36:37])
# SKIPPED 47 = Lagaog
# limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[47:48])
# ========= DONE ADDING



limited_MUNICIPALITIES = dict(list(MUNICIPALITIES.items())[1267:])
limited_MUNICIPALITIES

# output4 = pd.read_csv('../Output/Output4.csv')
# o4 = pd.read_csv('../Output/Output4.csv')
# output4

# o4.drop(columns=o4.columns[:1], axis=1, inplace=True)
# o4.append(output4)
# o4

{'Davao Occidental|Jose Abad Santos': 'https://philatlas.com/mindanao/r11/davao-occidental/jose-abad-santos.html',
 'Davao Occidental|Malita': 'https://philatlas.com/mindanao/r11/davao-occidental/malita.html',
 'Davao Occidental|Santa Maria': 'https://philatlas.com/mindanao/r11/davao-occidental/santa-maria.html',
 'Davao Occidental|Sarangani': 'https://philatlas.com/mindanao/r11/davao-occidental/sarangani.html',
 'Davao Oriental|Baganga': 'https://philatlas.com/mindanao/r11/davao-oriental/baganga.html',
 'Davao Oriental|Banaybanay': 'https://philatlas.com/mindanao/r11/davao-oriental/banaybanay.html',
 'Davao Oriental|Boston': 'https://philatlas.com/mindanao/r11/davao-oriental/boston.html',
 'Davao Oriental|Caraga': 'https://philatlas.com/mindanao/r11/davao-oriental/caraga.html',
 'Davao Oriental|Cateel': 'https://philatlas.com/mindanao/r11/davao-oriental/cateel.html',
 'Davao Oriental|Governor Generoso': 'https://philatlas.com/mindanao/r11/davao-oriental/governor-generoso.html',
 'Dava

In [139]:
# for i, (name,link) in enumerate(MUNICIPALITIES.items()):
for i, (name,link) in enumerate(limited_MUNICIPALITIES.items()):

    # Get correct name since the format is REGION NAME|MUNICIPALITY NAME
    municipality_name = name.split('|')[1]


    # right_table = get_specific_element(link, 'table', 'sortable datatable')
    right_table = get_specific_element(link, 'table', '', 'lguTable')
    municipality_summary = get_specific_element(link, 'table', 'iBox')




    # initialize blank lists
    Municipality = []
    Brgy = []
    Population_per = []
    Population_2020 = []
    Population_2015 = []
    Change_2015_2020 = []
    APGR_2015_2020 = []
    Postal = []
    CoastalLandLocked = []
    Marine = []
    Coordinates = []
    Sea_level = []




    for row in right_table.find_all("tr"):
        cells = row.find_all('td')
        barangay = row.find_all('a')

        if len(barangay) > 0:
            if len(cells) > 0:
                Municipality.append(municipality_name)
                Brgy.append(barangay[0].find(text=True))
                Population_per.append(cells[0].find(text=True))
                Population_2020.append(cells[1].find(text=True))
                Population_2015.append(cells[2].find(text=True))
                Change_2015_2020.append(cells[3].find(text=True))
                APGR_2015_2020.append(cells[4].find(text=True))


                # GET BARANGAY
                for _link in barangay:
                    _barangay_url = base_url + _link.get('href')

                    brgy_summary = get_specific_element(_barangay_url, 'table', 'iBox')
                    brgy_rows = brgy_summary.find_all('tr')


                    haveSeaLevel = brgy_summary.find(text=re.compile("Estimated"))
                    if not haveSeaLevel:
                        Sea_level.append("NULL")

                    havePostal = brgy_summary.find(text=re.compile("Postal"))
                    if not havePostal:
                        Postal.append("NULL")

                    haveCoordinates = brgy_summary.find(text=re.compile("Coordinates"))
                    if not haveCoordinates:
                        Coordinates.append("NULL")


                    for _row in brgy_rows:
                        _h = _row.find('th')

                        _summary_title = (_h.get_text()).casefold()

                        if _summary_title == 'postal code'.casefold():
                            Postal.append(_h.next_sibling.get_text())

                        if _summary_title == 'coordinates'.casefold():
                            Coordinates.append(_h.next_sibling.get_text())

                        if _summary_title.split(' ')[0] == 'Estimated'.casefold():
                            Sea_level.append(_h.next_sibling.get_text())



                for _row in municipality_summary('tr'):
                    _h = _row.find('th')
                    _summary_title = (_h.get_text()).lower()

                    if _summary_title == 'coastal/landlocked':
                        CoastalLandLocked.append(_h.next_sibling.get_text())

                    if _summary_title == 'marine waterbodies':
                        Marine.append(_h.next_sibling.get_text())


    #ialalgay na yung mga data sa dataframe
    df_output4 = pd.DataFrame()
    df_output4['Municipality Name'] = Municipality
    df_output4['Barangay Name'] = Brgy
    df_output4['Population percentage (2020)'] = Population_per
    df_output4['Population (2020)'] = Population_2020
    df_output4['Population (2015)'] = Population_2015
    df_output4['Change (2015‑2020)'] = Change_2015_2020
    df_output4['Annual Population Growth Rate (2015‑2020)'] = APGR_2015_2020

    if(len(Postal) != len(Brgy)):
        Postal = Postal[:len(Brgy)]
    df_output4['Postal Code'] = Postal
    df_output4['Coastal/Landlocked'] = CoastalLandLocked
    df_output4['Marine Waterbodies'] = Marine

    if(len(Coordinates) != len(Brgy)):
        Coordinates = Coordinates[:len(Brgy)]
    df_output4['Coordinates'] = Coordinates

    if(len(Sea_level) != len(Brgy)):
        Sea_level = Sea_level[:len(Brgy)]
    df_output4['Estimated Elevation above sea level'] = Sea_level

    output4 = output4.append(df_output4)

AttributeError: 'str' object has no attribute 'find_all'

In [140]:
output4
# output4.describe()

Unnamed: 0,Municipality Name,Barangay Name,Population percentage (2020),Population (2020),Population (2015),Change (2015‑2020),Annual Population Growth Rate (2015‑2020),Postal Code,Coastal/Landlocked,Marine Waterbodies,Coordinates,Estimated Elevation above sea level
0,Angeles,Agapito del Rosario,0.55%,2556,3230,-20.87%,-4.81%,2009.0,landlocked,none (landlocked),"15.1433, 120.5887 (15° 9' North, 120° 35' East)",103.1 meters (338.3 feet)
1,Angeles,Amsic,3.66%,16953,14379,17.90%,3.53%,2009.0,landlocked,none (landlocked),"15.1588, 120.5679 (15° 10' North, 120° 34' East)",121.7 meters (399.3 feet)
2,Angeles,Anunas,5.63%,26063,20911,24.64%,4.74%,2009.0,landlocked,none (landlocked),"15.1559, 120.5551 (15° 9' North, 120° 33' East)",138.6 meters (454.7 feet)
3,Angeles,Balibago,9.13%,42274,40087,5.46%,1.12%,2009.0,landlocked,none (landlocked),"15.1663, 120.5901 (15° 10' North, 120° 35' East)",103.4 meters (339.2 feet)
4,Angeles,Capaya,2.72%,12602,8870,42.07%,7.67%,2009.0,landlocked,none (landlocked),"15.1456, 120.6173 (15° 9' North, 120° 37' East)",73.2 meters (240.2 feet)
...,...,...,...,...,...,...,...,...,...,...,...,...
10,Don Marcelino,North Lamidan,4.47%,2036,2041,-0.24%,-0.05%,8013,coastal,Davao Gulf [Philippine Sea],"6.1011, 125.6912 (6° 6' North, 125° 41' East)",125.7 meters (412.4 feet)
11,Don Marcelino,Nueva Villa,4.57%,2079,2049,1.46%,0.31%,8013,coastal,Davao Gulf [Philippine Sea],"6.2003, 125.6524 (6° 12' North, 125° 39' East)","335.4 meters (1,100.4 feet)"
12,Don Marcelino,South Lamidan,3.63%,1654,1700,-2.71%,-0.58%,8013,coastal,Davao Gulf [Philippine Sea],"6.0883, 125.7031 (6° 5' North, 125° 42' East)",18.1 meters (59.4 feet)
13,Don Marcelino,Talagutong,15.25%,6945,6898,0.68%,0.14%,8013,coastal,Davao Gulf [Philippine Sea],"6.2615, 125.6661 (6° 16' North, 125° 40' East)",16.9 meters (55.4 feet)


In [141]:
# output4.to_csv(f'../Output/Output4.csv')

output4.to_csv(f'../Output/_Output4.csv')