In [31]:
# Libraries
from pathlib import Path
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [32]:
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("–disable-extensions")

In [33]:
# Wildfire Column headers
table_1_headers  = ['Event_id','Episode','Countries','Population_Affected','Max_Wind_speed','MAX_Storm_Surge','Vulnerability','Event_summary']
table_2_headers  = ['Event_id','Episode','Alert','N','Date','Max_Winds','Population_in_tropical_storm','Population_in_Cat1_and_above','Countries']
table_3_headers  = ['Event_id','Episode','Alert','N','Date','Category','Max_Winds','Population_in_Cat1_and_above', 'Population_in_Tropical_Storm_and_above','Lat/Long', 'Countries']
table_4_headers  = ['Event_id','Episode','Countries']
table_5_headers  = ['Event_id','Episode','Region_Province','Country']
table_6_headers  = ['Event_id','Episode','Name','Region_Province','Country','City_class']
table_7_headers  = ['Event_id','Episode','Name','IATA_Code','Elevation_in_m','Usage','Runway_type','IFR','Runway_Length_in_ft']
table_8_headers  = ['Event_id','Episode','Name','LOCODE','Country']
table_9_headers  = ['Event_id','Episode','Reservoir','Dam_Name','River','Year']
table_10_headers = ['Event_id','Episode','Site','Type','Name','Country']
table_11_headers = ['Event_id','Episode','Alert','Date ','Name','Country','Storm_Surge_Height']


# Main function
def create_csv(html_file):
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(html_file)


    # Define table configurations (XPaths and column limits)
    table_configs = [
        ('//*[@id="alert_summary_left"]/table/tbody', 6),
        ('//*[@id="impactTimeline"]/table/tbody', 7),
        ('//*[@id="bulletinTimeline"]/table/tbody', 9),
        ('//*[@id="countries"]/table/tbody', 1),
        ('//*[@id="provinces"]/table/tbody', 2),
        ('//*[@id="cities"]/table/tbody', 4),
        ('//*[@id="airports"]/table/tbody', 7),
        ('//*[@id="ports"]/table/tbody', 3),
        ('//*[@id="dams"]/table/tbody', 4),
        ('//*[@id="nuclear_npp"]/table/tbody', 5),
        ('//*[@id="locations"]/table/tbody', 5)
    ]

    # Process each table
    tables_data = []
    for i, (xpath, col_limit) in enumerate(table_configs):
        table_data = extract_table_data(driver, event_id, episode, xpath, col_limit)
        tables_data.append(table_data)
    for row in tables_data:
        print(row)
    driver.quit()

    # Define CSV destination
    destination_folder = 'CSV'
    absolute_folder_path = Path(destination_folder).resolve()

    # Generate CSV files
    for i, table_data in enumerate(tables_data,start=1):
        csv_file_path = f'{absolute_folder_path}\{event_type}_{i}.csv'
        file_exists = os.path.exists(csv_file_path)
        header = eval(f"table_{i}_headers")
        print(i,header)
        print(table_data)
        df = pd.DataFrame(table_data,columns=header)
        df.head()
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")



In [34]:

# Function to extract table data based on XPath and the number of columns required
def extract_table_data(driver, event_id, episode, xpath, col_limit):
    table_data = []
    if xpath == '//*[@id="alert_summary_left"]/table/tbody' or xpath == '//*[@id="tab_responsive"]/table/tbody' :
        try:
            temp_tbl=[]
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.XPATH, './/tr')
            if xpath == '//*[@id="alert_summary_left"]/table/tbody':
                event_summary = '//*[@id="alert_summary_left"]/p'
            else :
                if xpath == '//*[@id="tab_responsive"]/table/tbody':
                    event_summary = '//*[@class="p_summary"][1]'
            event_summary_text = driver.find_element(By.XPATH, event_summary).text
            for row in rows:
                columns = row.find_elements(By.XPATH, './/td')
                row_data = [column.text for column in columns]
                if len(row_data)>=1:
                    temp_tbl.append(row_data)
            table_data = [[event_id, episode,*[temp_tbl[i][1] for i in range(1,col_limit)]
                        ,event_summary_text]]
        except NoSuchElementException:
            print(f"Event summary not founds or table with XPath '{xpath}' not found.")
        return table_data
    else:
        try:
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.XPATH, './/tr')
            for row in rows:
                columns = row.find_elements(By.XPATH, './/td' if col_limit else './/th | .//td')
                row_data = []
                for col in columns:
                    cell_text = col.text.strip()

                    # If the cell is empty, look for an image
                    if not cell_text:
                        img_element = col.find_element(By.TAG_NAME, 'img') if col.find_elements(By.TAG_NAME, 'img') else None
                        if img_element:
                            # Try to get the title attribute
                            image_title = img_element.get_attribute('title')
                            if image_title:
                                row_data.append(image_title)  # Add the image title if present
                            else:
                                # Try to get the alt attribute if title is absent
                                alt_text = img_element.get_attribute('alt')
                                if alt_text:
                                    row_data.append(alt_text)  # Add the alt text if present
                                else:
                                    # If no title or alt, extract from the image URL
                                    image_url = img_element.get_attribute('src')
                                    image_code = image_url.split('/')[-1].split('.')[0]  # Extract "8p" from URL
                                    row_data.append(image_code)  # Add the extracted part from the URL
                        else:
                            row_data.append("")  # Leave it blank if no image is founD
                    else:
                        # If the cell contains text, just add it
                        row_data.append(cell_text) 

                if len(row_data) >= 1:
                    table_data.append([event_id, episode] + row_data[:col_limit if col_limit else len(row_data)])
        except NoSuchElementException:
            print(f"Table with XPath '{xpath}' not found.")
    return table_data

In [35]:
def list_filenames(folder_path):
    try:
        # Get a list of all files in the specified folder
        filenames = os.listdir(folder_path)
        
        return filenames
    
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

def main():
    folder_path = "html_files/Cyclones"
    absolute_folder_path = Path(folder_path).resolve()
    filenames = list_filenames(absolute_folder_path)

    for file in filenames:
        absolute_file_path = absolute_folder_path / file
        create_csv(f"file:///{absolute_file_path.as_posix()}")


In [36]:
if __name__ == "__main__":
    main()

1001052 20 Cyclones
Table with XPath '//*[@id="dams"]/table/tbody' not found.
Table with XPath '//*[@id="nuclear_npp"]/table/tbody' not found.
[['1001052', '20', 'Miscellaneous (French) Indian Ocean Islands, Mauritius', 'No people in Category 1 or higher', '116 km/h', '0.1 m (22 Feb 07:00 UTC)', 'Medium (Mauritius)', 'Tropical Cyclone ELEANOR-24 can have a low humanitarian impact based on the maximum sustained wind speed,exposed population and vulnerability.']]
[['1001052', '20', 'Orange', '1', '19 Feb 2024 00:00', '158', '2.2 million', '1.3 million', 'Mauritius'], ['1001052', '20', 'Orange', '2', '19 Feb 2024 06:00', '158', '2.2 million', '1.3 million', 'Mauritius'], ['1001052', '20', 'Orange', '3', '19 Feb 2024 12:00', '148', '2.2 million', '1.3 million', 'Mauritius'], ['1001052', '20', 'Green', '4', '19 Feb 2024 18:00', '74', 'No people', 'No people', ''], ['1001052', '20', 'Orange', '5', '20 Feb 2024 00:00', '137', '2.2 million', '1.2 million', 'Mauritius'], ['1001052', '20', 'Gree

Exception ignored in: <function Service.__del__ at 0x000001D14810B760>
Traceback (most recent call last):
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 189, in __del__
    self.stop()
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 146, in stop
    self.send_remote_shutdown_command()
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 126, in send_remote_shutdown_command
    request.urlopen(f"{self.service_url}/shutdown")
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\urllib\request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\urllib\request.py", line 519, in open
    response = self._open(req, data)
  File "c:\Users\CZ0222\AppData\Local\anaconda3\lib\urllib\request.py", line 536, in _open
    result = self._call_chain(self.h

1001053 20 Cyclones
Table with XPath '//*[@id="countries"]/table/tbody' not found.
Table with XPath '//*[@id="provinces"]/table/tbody' not found.
Table with XPath '//*[@id="cities"]/table/tbody' not found.
Table with XPath '//*[@id="airports"]/table/tbody' not found.
Table with XPath '//*[@id="ports"]/table/tbody' not found.
Table with XPath '//*[@id="dams"]/table/tbody' not found.
Table with XPath '//*[@id="nuclear_npp"]/table/tbody' not found.
Table with XPath '//*[@id="locations"]/table/tbody' not found.
[['1001053', '20', 'TC-2024-000032-MOZ', 'Mozambique, South Africa, Eswatini, Zimbabwe, Malawi', 'No people in Category 1 or higher', '158 km/h', 'n.a.', 'Tropical Cyclone FILIPO-24 can have a medium humanitarian impact based on the maximum sustained wind speed,exposed population and vulnerability.']]
[['1001053', '20', 'Green', '1', '03 Mar 2024 06:00', '63', 'No people', 'No people', ''], ['1001053', '20', 'Green', '2', '03 Mar 2024 12:00', '116', '3 million', 'No people', 'Madaga

TypeError: 'NoneType' object is not iterable