### HELPER METHOD

In [6]:
# FILE WRITER
from pathlib import Path
# NOTE: `output_file_name` must match the middle part of a scraper function's name:
# create_entity_file('surnames') calls scrape_surnames_list(). Check the scraper
# definitions in this notebook to see which names are available.
# With replace=True an existing file is overwritten with newly scraped data.

def create_entity_file(output_file_name, replace=False):
   """Scrape an entity list and write it to ``../res/raw/<output_file_name>.txt``.

   The scraper is looked up by name in this module's globals:
   ``create_entity_file('surnames')`` calls ``scrape_surnames_list()``.
   Each scraped item is written on its own line.

   Args:
      output_file_name: Base name of the output file; also selects the
         ``scrape_<output_file_name>_list`` function to call.
      replace: When False (default) an existing file is left untouched and
         no scraping happens. When True the file is rewritten from a fresh
         scrape.

   Raises:
      KeyError: If no ``scrape_<output_file_name>_list`` name exists in globals.
   """
   path = Path(f'../res/raw/{output_file_name}.txt')

   # Keep an existing file unless the caller explicitly asked to overwrite it,
   # and say so instead of claiming the file was created.
   if path.is_file() and not replace:
      print(f'SKIPPED {output_file_name} FILE (already exists; pass replace=True to overwrite)')
      return

   # Scrape before opening the file so a failed scrape never truncates
   # previously saved data.
   entities_list = globals()[f'scrape_{output_file_name}_list']()

   # 'w' creates the file when missing and truncates it otherwise. The explicit
   # UTF-8 encoding keeps the output identical across platforms, and the
   # context manager guarantees the handle is flushed and closed.
   with open(path, 'w', encoding='utf-8') as file:
      file.writelines(f'{x}\n' for x in entities_list)

   print(f'DONE CREATING {output_file_name} FILE')



In [7]:
# IMPORT PACKAGES
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString
from IPython.display import display
from selenium import webdriver


### Get Local Tourist Destinations

In [5]:


def scrape_tourist_dests_list():
   """Scrape tourist destination names from the traveling-up.com province guide.

   The page is loaded in Firefox through Selenium, then every top-level
   ``<li>`` of each ``<ul>`` inside the ``entry-content`` div is inspected.
   Items containing ``'Top tourist spots:'`` are split on commas; each piece
   is cut at the first ``'('``, lower-cased, stripped, and has non-breaking
   spaces replaced by regular spaces. Empty pieces are dropped.

   Returns:
      list[str]: Cleaned destination names in page order.

   Raises:
      ValueError: If the page has no ``div.entry-content`` element.
   """
   tourist_dest_url = 'https://www.traveling-up.com/travel-guide-81-provinces-of-the-philippines'
   marker = 'Top tourist spots:'

   # NOTE(review): the recorded cell output echoes this line, presumably a
   # Selenium deprecation warning for `executable_path` -- confirm and consider
   # migrating to a Service object.
   driver = webdriver.Firefox(executable_path='geckodriver.exe')
   try:
      driver.get(tourist_dest_url)
      html = driver.page_source
   finally:
      # Always shut the browser down; otherwise every call leaves a Firefox
      # process running.
      driver.quit()

   data = BeautifulSoup(html, 'lxml')
   content = data.find("div", {"class": "entry-content"})
   if content is None:
      raise ValueError('Could not find div.entry-content on the tourist destinations page')

   tourist_dest_list = []
   for ul in content.find_all("ul"):
      # Only direct <li> children, so nested lists are not counted twice here.
      for li in ul.find_all("li", recursive=False):
         if marker not in li.text:
            continue
         # Text after the marker is a comma-separated list of destinations.
         raw_dests = li.text.split(marker)[1].split(',')
         # Keep only the part before any parenthesis, normalise non-breaking
         # spaces, lower-case and trim.
         cleaned_dests = [dest.split('(')[0].replace('\xa0', ' ').lower().strip()
                          for dest in raw_dests]
         # Drop empty strings left by trailing commas and the like.
         tourist_dest_list.extend(dest for dest in cleaned_dests if dest)
   print('DONE SCRAPING...')
   return tourist_dest_list

In [52]:
# Re-scrape and rewrite ../res/raw/tourist_dests.txt, overwriting any existing file.
create_entity_file('tourist_dests', replace=True)


  driver = webdriver.Firefox(executable_path='geckodriver.exe')


DONE SCRAPING...
DONE CREATING tourist_dests FILE


### Get Surnames

In [None]:

def scrape_surnames_list():
    """Scrape Filipino surnames from the baguiocityguide.com top-1000 article.

    Every HTML table on the page is read with pandas and the values of its
    ``'Surname'`` column are collected in lower case. Missing cells are skipped.

    Returns:
        list[str]: Lower-cased surnames in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a table on the page has no ``'Surname'`` column.
    """
    url = 'https://baguiocityguide.com/how-common-is-your-last-name-here-are-the-top-1000-filipino-surnames'

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled server from hanging the notebook; failing on
    # HTTP errors avoids parsing an error page as if it were data.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(r.text))

    surnames_list = []
    for table in dfs:
        # dropna() skips empty cells, which would otherwise crash on .lower().
        surnames_list.extend(
            str(surname).lower() for surname in table['Surname'].dropna().to_list()
        )
    return surnames_list

In [None]:
# Re-scrape and rewrite ../res/raw/surnames.txt, overwriting any existing file.
create_entity_file('surnames', replace=True)


DONE CREATING surnames FILE


### Get First names

In [None]:

def scrape_first_names_list():
    """Scrape Philippine forenames from forebears.io.

    Every HTML table on the page is read with pandas and the values of its
    ``'Forename'`` column are collected in lower case. Missing cells are skipped.

    Returns:
        list[str]: Lower-cased first names in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a table on the page has no ``'Forename'`` column.
    """
    url = 'https://forebears.io/philippines/forenames'

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled server from hanging the notebook; failing on
    # HTTP errors avoids parsing an error page as if it were data.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(r.text))

    firstnames_list = []
    for table in dfs:
        # dropna() skips empty cells, which would otherwise crash on .lower().
        firstnames_list.extend(
            str(first_name).lower() for first_name in table['Forename'].dropna().to_list()
        )
    return firstnames_list


In [None]:
# Re-scrape and rewrite ../res/raw/first_names.txt, overwriting any existing file.
create_entity_file('first_names', replace=True)


DONE CREATING first_names FILE


### Get Local Events 

In [None]:

def scrape_events_list():
   """Scrape Philippine festival and event names from the tpb.gov.ph calendar.

   Every HTML table on the page is read with pandas and the values of its
   ``'Name of Event'`` column are collected in lower case. Cells that only
   contain a month name (compared case-insensitively) and missing cells are
   skipped.

   Returns:
      list[str]: Lower-cased event names in page order.

   Raises:
      requests.HTTPError: If the server answers with an error status.
      KeyError: If a table on the page has no ``'Name of Event'`` column.
   """
   url = 'https://www.tpb.gov.ph/tpb-calendar-of-promotions-and-marketing-activities/calendar-of-philippine-festivals-and-monthly-observances-theme/'

   header = {
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest"
   }

   # A timeout keeps a stalled server from hanging the notebook; failing on
   # HTTP errors avoids parsing an error page as if it were data.
   r = requests.get(url, headers=header, timeout=30)
   r.raise_for_status()
   # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
   dfs = pd.read_html(StringIO(r.text))

   months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
   # Lower-case the month names once instead of once per event.
   month_names = {month.lower() for month in months}

   events_list = []
   for table in dfs:
      # dropna() skips empty cells, which would otherwise crash on .lower().
      for event in table['Name of Event'].dropna().to_list():
         event_name = str(event).lower()
         if event_name not in month_names:
            events_list.append(event_name)

   return events_list

In [None]:
# Re-scrape and rewrite ../res/raw/events.txt, overwriting any existing file.
create_entity_file('events', replace=True)
# Alternative: with replace=False an already existing file is left untouched.
# create_entity_file('events', replace=False)

DONE CREATING events FILE


### Get Local Companies

In [43]:

def scrape_local_companies_list():
    """Scrape company names from Wikipedia's list of companies of the Philippines.

    All HTML tables on the page are read with pandas. The first table is
    skipped; from every other table the ``'Name'`` column is collected in
    lower case. Missing cells are skipped.

    Returns:
        list[str]: Lower-cased company names in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a table after the first has no ``'Name'`` column.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_companies_of_the_Philippines'

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled server from hanging the notebook; failing on
    # HTTP errors avoids parsing an error page as if it were data.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(r.text))

    local_companies_list = []
    # Skip first table
    for table in dfs[1:]:
        # dropna() skips empty cells, which would otherwise crash on .lower().
        local_companies_list.extend(
            str(company_name).lower() for company_name in table['Name'].dropna().to_list()
        )
    return local_companies_list


In [44]:
# Re-scrape and rewrite ../res/raw/local_companies.txt, overwriting any existing file.
create_entity_file('local_companies', replace=True)


DONE CREATING local_companies FILE


### Get NGOs and Government Organizations

In [45]:

def scrape_organizations_list(num_pages=3):
   """Scrape organization names from the World Justice Project resource hub.

   Requests result pages ``0 .. num_pages - 1`` of the leading-organizations
   listing (``geography=174``), reads every HTML table on each page with
   pandas and collects the ``'Organization Name'`` column, lower-cased and
   stripped. Missing cells are skipped.

   Args:
      num_pages: Number of result pages to fetch. Defaults to 3.

   Returns:
      list[str]: Cleaned organization names in page order.

   Raises:
      requests.HTTPError: If the server answers with an error status.
      KeyError: If a table has no ``'Organization Name'`` column.
   """
   # The headers are the same for every page, so build them once.
   header = {
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
      "X-Requested-With": "XMLHttpRequest"
   }

   orgs_list = []
   for page in range(num_pages):
      url = f'https://worldjusticeproject.org/resource-hub/leading-organizations?factor=All&geography=174&name=&order=field_organization_name_trans&sort=asc&page={page}'

      # A timeout keeps a stalled server from hanging the notebook; failing on
      # HTTP errors avoids parsing an error page as if it were data.
      r = requests.get(url, headers=header, timeout=30)
      r.raise_for_status()
      # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
      dfs = pd.read_html(StringIO(r.text))

      for table in dfs:
         # dropna() skips empty cells, which would otherwise crash on .lower().
         orgs_list.extend(
            str(org).lower().strip() for org in table['Organization Name'].dropna().to_list()
         )

   return orgs_list
# Smoke test: run the scraper once and print the exception message if it fails.
# The returned list is discarded here; the output file is written by the
# create_entity_file('organizations', ...) call in the next cell.
try:
   scrape_organizations_list()
except Exception as e:
   print(f'{e}')


In [48]:
# Re-scrape and rewrite ../res/raw/organizations.txt, overwriting any existing file.
create_entity_file('organizations',replace=True)

DONE CREATING organizations FILE


### Get government agencies

In [53]:
import math 
def scrape_gov_agencies_list():
    """Scrape government agency names from the Official Gazette website list.

    Only the first HTML table on the page is used; the values of its column
    labelled ``0`` are collected in lower case. Missing cells are skipped.

    Returns:
        list[str]: Lower-cased agency names in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://www.officialgazette.gov.ph/lists/government-websites/'

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled server from hanging the notebook; failing on
    # HTTP errors avoids parsing an error page as if it were data.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(r.text))

    # Drop missing cells before converting to str. Checking for None after
    # str() can never match and would let a real None through as 'none'.
    first_column = dfs[0][0].dropna()
    return [str(gov_agency).lower() for gov_agency in first_column.to_list()]


In [50]:
# Re-scrape and rewrite ../res/raw/gov_agencies.txt, overwriting any existing file.
create_entity_file('gov_agencies',replace=True)

DONE CREATING gov_agencies FILE


### Get Local Holidays

In [13]:

def scrape_holidays_list(year=2022):
    """Scrape Philippine holiday names from officeholidays.com.

    Every HTML table on the page for ``year`` is read with pandas and the
    values of its ``'Holiday Name'`` column are collected in lower case.
    Missing cells are skipped.

    Args:
        year: Calendar year whose holiday page is fetched. Defaults to 2022.

    Returns:
        list[str]: Lower-cased holiday names in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a table on the page has no ``'Holiday Name'`` column.
    """
    url = f'https://www.officeholidays.com/countries/philippines/{year}'

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # A timeout keeps a stalled server from hanging the notebook; failing on
    # HTTP errors avoids parsing an error page as if it were data.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    dfs = pd.read_html(StringIO(r.text))

    holidays_list = []
    for table in dfs:
        # dropna() skips empty cells, which would otherwise crash on .lower().
        holidays_list.extend(
            str(holiday).lower() for holiday in table['Holiday Name'].dropna().to_list()
        )

    return holidays_list



In [15]:
# Re-scrape and rewrite ../res/raw/holidays.txt, overwriting any existing file.
create_entity_file('holidays', replace=True)
