In [9]:
# Libraries
import json
from pathlib import Path
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [10]:
# Set up Chrome options
chrome_options = Options()
# Chrome command-line switches are prefixed with two ASCII hyphens. The flag was
# previously written with an en dash ("–"), so it was not passed as a "--" switch.
chrome_options.add_argument("--disable-extensions")

In [11]:
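# Cyclone column headers merged with table configs.
# Kept for reference only: the dict below is wrapped in a string literal, so it
# does not define `table_configs` at runtime.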
'''table_configs = {
    'Table_1': {
        'headers': ['Event_id', 'Episode', 'Countries', 'Population_Affected', 'Max_Wind_speed', 'MAX_Storm_Surge', 'Vulnerability', 'Event_summary'],
        'xpath': '//*[@id="alert_summary_left"]/table/tbody',
        'col_limit': 6
    },
    'Table_2': {
        'headers': ['Event_id', 'Episode', 'Alert', 'N', 'Date', 'Max_Winds', 'Population_in_tropical_storm', 'Population_in_Cat1_and_above', 'Countries'],
        'xpath': '//*[@id="impactTimeline"]/table/tbody',
        'col_limit': 7
    },
    'Table_3': {
        'headers': ['Event_id', 'Episode', 'Alert', 'N', 'Date', 'Category', 'Max_Winds', 'Population_in_Cat1_and_above', 'Population_in_Tropical_Storm_and_above', 'Lat/Long', 'Countries'],
        'xpath': '//*[@id="bulletinTimeline"]/table/tbody',
        'col_limit': 9
    },
    'Table_4': {
        'headers': ['Event_id', 'Episode', 'Countries'],
        'xpath': '//*[@id="countries"]/table/tbody',
        'col_limit': 1
    },
    'Table_5': {
        'headers': ['Event_id', 'Episode', 'Region_Province', 'Country'],
        'xpath': '//*[@id="provinces"]/table/tbody',
        'col_limit': 2
    },
    'Table_6': {
        'headers': ['Event_id', 'Episode', 'Name', 'Region_Province', 'Country', 'City_class'],
        'xpath': '//*[@id="cities"]/table/tbody',
        'col_limit': 4
    },
    'Table_7': {
        'headers': ['Event_id', 'Episode', 'Name', 'IATA_Code', 'Elevation_in_m', 'Usage', 'Runway_type', 'IFR', 'Runway_Length_in_ft'],
        'xpath': '//*[@id="airports"]/table/tbody',
        'col_limit': 7
    },
    'Table_8': {
        'headers': ['Event_id', 'Episode', 'Name', 'LOCODE', 'Country'],
        'xpath': '//*[@id="ports"]/table/tbody',
        'col_limit': 3
    },
    'Table_9': {
        'headers': ['Event_id', 'Episode', 'Reservoir', 'Dam_Name', 'River', 'Year'],
        'xpath': '//*[@id="dams"]/table/tbody',
        'col_limit': 4
    },
    'Table_10': {
        'headers': ['Event_id', 'Episode', 'Site', 'Type', 'Name', 'Country'],
        'xpath': '//*[@id="nuclear_npp"]/table/tbody',
        'col_limit': 5
    },
    'Table_11': {
        'headers': ['Event_id', 'Episode', 'Alert', 'Date', 'Name', 'Country', 'Storm_Surge_Height'],
        'xpath': '//*[@id="locations"]/table/tbody',
        'col_limit': 5
    }
}'''

# Load table configs from file
def load_table_configs(file_path):
    """Read a table-config JSON file and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is locale-dependent (e.g. cp1252 on Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


In [12]:
def create_csv(html_file):
    """Scrape every configured table from one saved HTML page into CSV files.

    The file name (last URL segment, up to the first ".") is split on "_" and
    its first three parts are used as event id, episode and event type. The
    table configs are loaded from "<event_type>.json", and each table is
    written (or appended, when the file already exists) to
    "CSV2/<event_type>_<table key>.csv".

    Args:
        html_file: URL of the page to open in Chrome, using "/" separators.

    Raises:
        ValueError: If the file name has fewer than three "_"-separated parts.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # Load the table configs
    table_configs_file = f"{event_type}.json"
    table_configs = load_table_configs(table_configs_file)

    # Initialize the WebDriver; the finally clause guarantees the browser is
    # closed even when extraction raises.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)

        # Process each table, keeping track of table names and data
        tables_data = []
        for key, config in table_configs.items():
            table_data = extract_table_data(
                driver, event_id, episode, config['xpath'], config['col_limit']
            )
            tables_data.append((key, table_data))
    finally:
        driver.quit()

    # Define CSV destination and make sure it exists before writing
    destination_folder = 'CSV2'
    absolute_folder_path = Path(destination_folder).resolve()
    absolute_folder_path.mkdir(parents=True, exist_ok=True)

    # Generate CSV files
    for key, table_data in tables_data:
        # Join with pathlib so the separator is correct on every platform
        # (the previous hard-coded backslash only worked on Windows).
        csv_file_path = absolute_folder_path / f"{event_type}_{key}.csv"
        file_exists = csv_file_path.exists()
        header = table_configs[key]['headers']  # Fetch the headers from the loaded structure
        print(key, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        # Append without repeating the header when the file is already there
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")
    
    

In [None]:
def create_csv(html_file):
    """Scrape every configured table from one saved HTML page into CSV files.

    The file name (last URL segment, up to the first ".") is split on "_" and
    its first three parts are used as event id, episode and event type. The
    table configs are loaded from "<event_type>.json", and each table is
    written (or appended, when the file already exists) to
    "CSV2/<event_type>_<table key>.csv". One row describing the page is also
    appended to "CSV2/main.csv".

    Args:
        html_file: URL of the page to open in Chrome, using "/" separators.

    Raises:
        ValueError: If the file name has fewer than three "_"-separated parts.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # Load the table configs
    table_configs_file = f"{event_type}.json"
    table_configs = load_table_configs(table_configs_file)

    # Initialize the WebDriver; the finally clause guarantees the browser is
    # closed even when extraction raises.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)

        # Process each table, keeping track of table names and data
        tables_data = []
        for key, config in table_configs.items():
            table_data = extract_table_data(
                driver, event_id, episode, config['xpath'], config['col_limit']
            )
            tables_data.append((key, table_data))
    finally:
        driver.quit()

    # Define CSV destination and make sure it exists before writing
    destination_folder = 'CSV2'
    absolute_folder_path = Path(destination_folder).resolve()
    absolute_folder_path.mkdir(parents=True, exist_ok=True)

    # Generate CSV files
    for key, table_data in tables_data:
        # Join with pathlib so the separator is correct on every platform
        # (the previous hard-coded backslash only worked on Windows).
        csv_file_path = absolute_folder_path / f"{event_type}_{key}.csv"
        file_exists = csv_file_path.exists()
        header = table_configs[key]['headers']  # Fetch the headers from the loaded structure
        print(key, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        # Append without repeating the header when the file is already there
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")

    # Create infoFile
    info_file_path = absolute_folder_path / "main.csv"
    file_exists = info_file_path.exists()
    info_header = ["event_id", "episode_id", "impact_url", "event_type"]
    # NOTE(review): this row used to come from an undefined `html_info` name
    # (NameError at runtime). It is rebuilt here from the values parsed above,
    # assuming "impact_url" means the URL that was opened; confirm.
    html_info = [[event_id, episode, html_file, event_type]]
    info_df = pd.DataFrame(html_info, columns=info_header)
    info_df.to_csv(info_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
    print(f"Data {'appended to' if file_exists else 'written to'} {info_file_path}")

In [13]:

# Variant of create_csv that prefers an in-notebook `table_configs` dict
def create_csv1(html_file):
    """Scrape every configured table from one saved HTML page into CSV files.

    The file name (last URL segment, up to the first ".") is split on "_" and
    its first three parts are used as event id, episode and event type. Each
    table is written (or appended, when the file already exists) to
    "CSV2/<event_type>_<table key>.csv".

    Table configs come from a module-level ``table_configs`` dict when one is
    defined; otherwise they are loaded from "<event_type>.json".

    Args:
        html_file: URL of the page to open in Chrome, using "/" separators.

    Raises:
        ValueError: If the file name has fewer than three "_"-separated parts.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # The global dict is not guaranteed to exist (a fresh kernel never defines
    # it), so fall back to the JSON file instead of failing with NameError.
    try:
        configs = table_configs
    except NameError:
        configs = load_table_configs(f"{event_type}.json")

    # Initialize the WebDriver; the finally clause guarantees the browser is
    # closed even when extraction raises.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)

        # Process each table, keeping track of table names and data
        tables_data = []
        for key, config in configs.items():
            table_data = extract_table_data(
                driver, event_id, episode, config['xpath'], config['col_limit']
            )
            tables_data.append((key, table_data))
    finally:
        driver.quit()

    # Define CSV destination and make sure it exists before writing
    destination_folder = 'CSV2'
    absolute_folder_path = Path(destination_folder).resolve()
    absolute_folder_path.mkdir(parents=True, exist_ok=True)

    # Generate CSV files
    for key, table_data in tables_data:
        # Join with pathlib so the separator is correct on every platform
        # (the previous hard-coded backslash only worked on Windows).
        csv_file_path = absolute_folder_path / f"{event_type}_{key}.csv"
        file_exists = csv_file_path.exists()
        header = configs[key]['headers']  # Fetch the headers from the merged structure
        print(key, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        # Append without repeating the header when the file is already there
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")


In [14]:

def _image_label(img_element):
    """Return the image's title, else its alt text, else the file stem of its src URL."""
    image_title = img_element.get_attribute('title')
    if image_title:
        return image_title
    alt_text = img_element.get_attribute('alt')
    if alt_text:
        return alt_text
    # A missing src attribute yields None; treat it as empty rather than crash.
    image_url = img_element.get_attribute('src') or ""
    return image_url.split('/')[-1].split('.')[0]  # e.g. "8p" from ".../8p.png"


def _cell_value(col):
    """Return a cell's stripped text, or a label taken from its first <img> when the text is empty."""
    cell_text = col.text.strip()
    if cell_text:
        return cell_text
    images = col.find_elements(By.TAG_NAME, 'img')  # single lookup; empty list when no image
    return _image_label(images[0]) if images else ""


# Function to extract table data based on XPath and the number of columns required
def extract_table_data(driver, event_id, episode, xpath, col_limit):
    """Extract rows from the table located by ``xpath`` on the current page.

    Two layouts are handled:

    * Summary tables (the ``alert_summary_left`` and ``tab_responsive``
      XPaths): the second cell of rows 1 .. ``col_limit - 1`` (counting only
      rows that contain ``td`` cells) is collected into one output row,
      followed by the text of the page's summary paragraph. Missing rows or
      cells contribute an empty string.
    * Any other table: every row with at least one cell becomes an output
      row, truncated to ``col_limit`` cells. When ``col_limit`` is falsy,
      ``th`` cells are included as well and nothing is truncated.

    Every output row starts with ``event_id`` and ``episode``.

    Args:
        driver: Object exposing ``find_element(by, value)``.
        event_id: Value placed in the first column of every row.
        episode: Value placed in the second column of every row.
        xpath: XPath of the table body to read.
        col_limit: Column limit as described above.

    Returns:
        A list of rows (lists). Empty when the table, or for summary tables
        the summary paragraph, is not found; a message is printed in that case.
    """
    table_data = []
    # Summary-style tables, mapped to the XPath of their summary paragraph.
    summary_xpaths = {
        "//*[@id='alert_summary_left']/table/tbody": "//*[@id='alert_summary_left']/p",
        "//*[@id='tab_responsive']/table/tbody": "//*[@class='p_summary'][1]",
    }

    if xpath in summary_xpaths:
        try:
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.XPATH, './/tr')
            event_summary_text = driver.find_element(By.XPATH, summary_xpaths[xpath]).text

            # Keep only rows that actually contain td cells.
            temp_tbl = []
            for row in rows:
                row_data = [column.text for column in row.find_elements(By.XPATH, './/td')]
                if row_data:
                    temp_tbl.append(row_data)

            # The value sits in the second cell of each row; row 0 is skipped.
            # Bounds are checked so a shorter-than-expected table yields blanks
            # instead of an uncaught IndexError.
            values = [
                temp_tbl[i][1] if i < len(temp_tbl) and len(temp_tbl[i]) > 1 else ""
                for i in range(1, col_limit)
            ]
            table_data = [[event_id, episode, *values, event_summary_text]]
        except NoSuchElementException:
            print(f"Event summary not founds or table with XPath '{xpath}' not found.")
        return table_data

    try:
        table = driver.find_element(By.XPATH, xpath)
        rows = table.find_elements(By.XPATH, './/tr')
        for row in rows:
            columns = row.find_elements(By.XPATH, './/td' if col_limit else './/th | .//td')
            row_data = [_cell_value(col) for col in columns]
            if row_data:
                kept = row_data[:col_limit] if col_limit else row_data
                table_data.append([event_id, episode] + kept)
    except NoSuchElementException:
        print(f"Table with XPath '{xpath}' not found.")
    return table_data

In [15]:
def list_filenames(folder_path):
    """Return the names of all entries in ``folder_path``, sorted.

    Args:
        folder_path: Directory to list.

    Returns:
        A sorted list of entry names as reported by ``os.listdir`` (this
        includes sub-directories). An empty list is returned, after printing
        a message, when the folder is missing or cannot be read, so callers
        can always iterate over the result.
    """
    try:
        # os.listdir gives entries in arbitrary order; sort for reproducible runs.
        return sorted(os.listdir(folder_path))
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return []

def main():
    """Run ``create_csv`` on every file found in "html_files/Cyclones".

    The folder is resolved to an absolute path and each file in it is passed
    to ``create_csv`` as a "file:///" URL.
    """
    folder_path = "html_files/Cyclones"
    absolute_folder_path = Path(folder_path).resolve()
    # Guard against a None result so a missing folder does not raise TypeError.
    filenames = list_filenames(absolute_folder_path) or []

    for file in filenames:
        absolute_file_path = absolute_folder_path / file
        # The listing can contain sub-directories (e.g. checkpoint folders);
        # only regular files are pages to scrape.
        if not absolute_file_path.is_file():
            continue
        create_csv(f"file:///{absolute_file_path.as_posix()}")


In [None]:
# Entry point: run the batch conversion when this code is executed directly
# (the condition is false when the code is imported as a module).
if __name__ == "__main__":
    main()

1001052 20 Cyclones
Table with XPath '//*[@id='dams']/table/tbody' not found.
Table with XPath '//*[@id='nuclear_npp']/table/tbody' not found.
Table_1 ['Event_id', 'Episode', 'Countries', 'Population_Affected', 'Max_Wind_speed', 'MAX_Storm_Surge', 'Vulnerability', 'Event_summary']
[['1001052', '20', 'Miscellaneous (French) Indian Ocean Islands, Mauritius', 'No people in Category 1 or higher', '116 km/h', '0.1 m (22 Feb 07:00 UTC)', 'Medium (Mauritius)', 'Tropical Cyclone ELEANOR-24 can have a low humanitarian impact based on the maximum sustained wind speed,exposed population and vulnerability.']]
Data written to D:\Web Scraping\Web-Scraping\CSV2\Cyclones_Table_1.csv
Table_2 ['Event_id', 'Episode', 'Alert', 'N', 'Date', 'Max_Winds', 'Population_in_tropical_storm', 'Population_in_Cat1_and_above', 'Countries']
[['1001052', '20', 'Orange', '1', '19 Feb 2024 00:00', '158', '2.2 million', '1.3 million', 'Mauritius'], ['1001052', '20', 'Orange', '2', '19 Feb 2024 06:00', '158', '2.2 million'