In [42]:
from bs4 import BeautifulSoup
import requests
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
import re

In [43]:
# Root page of the PHIVOLCS earthquake site; it is fetched and parsed in the following cells.
url = "https://earthquake.phivolcs.dost.gov.ph/"

In [44]:
# Download the page. NOTE: verify=False turns off TLS certificate verification
# for this request (the resulting InsecureRequestWarning is silenced in the
# imports cell), so the response is not authenticated.
page = requests.get(url, verify=False)

In [45]:
# Parse the decoded response body so its table rows can be queried later.
# 'html' requests an HTML parser by feature rather than naming a specific one,
# so the parser actually used presumably depends on what is installed — TODO confirm.
soup = BeautifulSoup(page.text, 'html')

In [46]:
def is_row_valid(row):
  """Decide whether a flattened table row should be parsed as a data row.

  Args:
    row: Text of one table row (or None).

  Returns:
    False when `row` is None, is shorter than 70 characters, starts with one
    of the rejected prefixes, or contains one of the rejected phrases;
    True otherwise.
  """
  if row is None:
    return False

  # Rows below this length are rejected outright.
  if len(row) < 70:
    return False

  # Rejected prefixes: the "PHIVOLCS" and "Date" labels, month names in
  # upper and title case, and the leading digits of years 199x-203x.
  invalid_starts = (
      "PHIVOLCS", "Date",
      "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
      "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
      "199", "200", "201", "202", "203",
  )
  invalid_phrases = ("SEISMICITY MAPS", "PHIVOLCS LATEST", "Seismological Observation")

  # str.startswith accepts a tuple and returns True if any prefix matches.
  if row.startswith(invalid_starts):
    return False

  return not any(phrase in row for phrase in invalid_phrases)

In [47]:
def extract_data(input_str):
    """Split one flattened table row into six string fields.

    The row is consumed from left to right: each pattern is matched at the
    start of what is left, the matched text (or "" when there is no match) is
    recorded, that many characters are cut off, and the rest is stripped of
    surrounding whitespace before the next pattern is tried.

    Args:
        input_str: Text of one table row.

    Returns:
        A 6-tuple of strings. The first five are the matches for the patterns
        below, in order; the sixth is whatever text remains. The callers in
        this notebook store them as datetime, latitude, longitude, depth,
        magnitude and location.
    """
    # Ordered prefix patterns; every one is anchored by re.match.
    field_patterns = (
        r"^\d{2} \w+ \d{4} - \d{2}:\d{2} [AP]M",  # e.g. "15 May 2025 - 05:49 PM"
        r"(\d{2}\.\d{2})",                        # two integer digits, two decimals
        r"(\d{3}\.\d{2})",                        # three integer digits, two decimals
        r"(\d{3})",                               # three-digit whole number
        r"(\d\.\d)",                              # one integer digit, one decimal
    )

    remainder = input_str
    fields = []
    for pattern in field_patterns:
        match = re.match(pattern, remainder)
        piece = match.group(0) if match else ""
        fields.append(piece)
        # A failed match removes nothing, so the next pattern sees the same text.
        remainder = remainder[len(piece):].strip()

    # Whatever is left after the five leading fields is returned as the last field.
    return (*fields, remainder)

In [48]:
# Turn every <tr> of the parsed landing page into one record and build the
# DataFrame in a single step. Each record is the 6-tuple returned by
# extract_data, so the fields of a row always stay together.
rows = soup.find_all('tr')

records = []
for table_row in rows:
  # Flatten the row's text: newlines and tabs are removed, spaces are kept.
  row = table_row.text.replace('\n', '').replace('\t', '')
  if is_row_valid(row):
    records.append(extract_data(row))

df = pd.DataFrame(
    records,
    columns=['Datetime', 'Latitude', 'Longitude', 'Depth', 'Magnitude', 'Location'],
)
df.head(2)

Unnamed: 0,Datetime,Latitude,Longitude,Depth,Magnitude,Location
0,15 May 2025 - 05:49 PM,6.26,126.72,35,2.2,084 km S 59Â° E of Governor Generoso (Davao O...
1,15 May 2025 - 05:08 PM,10.61,126.68,45,2.5,093 km N 45Â° E of Burgos (Surigao Del Norte)


In [49]:
# Monthly archive pages follow this URL scheme:
#   https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2018/2018_December.html
#   https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/2023/2023_February.html
#   https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/{year}/{year}_{month}.html

# Newest first: years[0] is also used for the export file name in the next cell.
years = [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018]
months = ["December", "November", "October", "September", "August", "July",
          "June", "May", "April", "March", "February", "January"]

history_columns = ['Datetime', 'Latitude', 'Longitude', 'Depth', 'Magnitude', 'Location']

# One 6-tuple per parsed row, across all months. The DataFrame is built once
# at the end instead of being re-concatenated for every month.
history_records = []

for year in years:
  for month in months:
    url_history = f"https://earthquake.phivolcs.dost.gov.ph/EQLatest-Monthly/{year}/{year}_{month}.html"

    # Fetch each page exactly once and reuse the response for both the status
    # check and the parsing.
    # NOTE: verify=False disables TLS certificate verification.
    page_history = requests.get(url_history, verify=False)

    # Only pages answered with HTTP 200 are parsed; anything else is skipped.
    if page_history.status_code != 200:
      continue

    # NOTE(review): page_history.text uses the encoding requests guessed. The
    # displayed output shows "Â°" where a degree sign is expected, which may
    # mean the page is decoded with the wrong charset. Confirm before
    # changing it, because is_row_valid's length threshold depends on the
    # decoded text.
    soup_history = BeautifulSoup(page_history.text, 'html')

    for table_row in soup_history.find_all('tr'):
      # The order of these replacements is significant and is kept as-is.
      row = (
          table_row.text
          .replace('\n', '')
          .replace('\t', '')
          .replace('  ', '')
          .replace('\r', '')
          .replace('<', '')
          .replace('>', '')
      )

      try:
        if is_row_valid(row):
          # Appending the whole tuple at once keeps the columns aligned even
          # if parsing fails part-way.
          history_records.append(extract_data(row))
      except Exception:
        # Best effort: report the row and keep going. KeyboardInterrupt and
        # SystemExit are deliberately not caught here.
        print(f"ROW INVALID: {row}")

# Built in one step, so the index is a single 0..N-1 range instead of
# restarting at 0 for every month.
df_history = pd.DataFrame(history_records, columns=history_columns)

df_history.head(10)

Unnamed: 0,Datetime,Latitude,Longitude,Depth,Magnitude,Location
0,30 April 2025 - 11:53 PM,4.47,126.36,19,4.4,147km S 45Â° E of Balut Island (Municipality O...
1,30 April 2025 - 11:48 PM,9.94,126.07,33,2.0,006km S 31Â° E of San Benito (Surigao Del Norte)
2,30 April 2025 - 10:44 PM,7.43,127.24,23,3.0,076km N 82Â° E of Caraga (Davao Oriental)
3,30 April 2025 - 08:49 PM,9.43,122.18,24,2.6,037km S 59Â° W of Hinoba-an (Negros Occidental)
4,30 April 2025 - 08:10 PM,8.97,126.11,71,2.5,009km N 74Â° E of San Miguel (Surigao Del Sur)
5,30 April 2025 - 07:56 PM,11.61,124.5,26,3.0,007km S 43Â° W of Culaba (Biliran)
6,30 April 2025 - 07:32 PM,10.83,122.94,32,1.0,005km N 51Â° W of City Of Silay (Negros Occide...
7,30 April 2025 - 07:04 PM,12.72,124.87,8,2.6,018km N 20Â° W of Pambujan (Northern Samar)
8,30 April 2025 - 06:24 PM,2.66,126.81,148,4.4,341km S 27Â° E of Balut Island (Municipality O...
9,30 April 2025 - 05:03 PM,8.37,125.88,2,2.4,014km S 83Â° W of Rosario (Agusan Del Sur)


In [50]:
# Write the scraped history to a CSV file in the working directory, without the
# index column. The file name is built from the first entry of `years`.
df_history.to_csv(f"earthquake_ph_list_{years[0]}.csv", index=False)