In [16]:
# Libraries
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [17]:
# Set up Chrome options shared by every WebDriver session created in this notebook.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
# Command-line switches must start with two ASCII hyphens. The previous value
# began with an en dash ("–"), which Chrome does not parse as a switch, so the
# option was never applied.
chrome_options.add_argument("--disable-extensions")

In [18]:
# Column headers for the scraped tables.
# wildfire_csv resolves these lists by name (table_<n>_headers), so the
# variable names below must not change.
def _numbered_columns(count):
    """Return the placeholder header list ['col_1', ..., 'col_<count>']."""
    return [f'col_{position}' for position in range(1, count + 1)]


table_0_headers = _numbered_columns(4)
table_1_headers = _numbered_columns(10)
table_2_headers = _numbered_columns(12)
table_3_headers = _numbered_columns(4)
table_4_headers = _numbered_columns(12)
table_5_headers = _numbered_columns(6)
table_6_headers = _numbered_columns(9)
table_7_headers = _numbered_columns(11)
table_8_headers = _numbered_columns(7)
table_9_headers = _numbered_columns(8)
table_10_headers = _numbered_columns(7)

# Function to extract table data based on XPath and the number of columns required
def _cell_value(cell):
    """Return a table cell's stripped text, falling back to its first image.

    For a cell without text the first ``<img>`` inside it is used: its
    ``title`` attribute if set, otherwise the file name of its ``src`` URL
    without the extension (e.g. ".../8p.png" -> "8p"). A cell with neither
    text nor image yields an empty string.
    """
    text = cell.text.strip()
    if text:
        return text

    # find_elements returns an empty list when nothing matches, so a single
    # query answers both "is there an image?" and "which one?".
    images = cell.find_elements(By.TAG_NAME, 'img')
    if not images:
        return ""

    image = images[0]
    title = image.get_attribute('title')
    if title:
        return title

    # get_attribute returns None for a missing attribute; treat that as an
    # empty URL instead of failing on None.split.
    source = image.get_attribute('src') or ""
    return source.split('/')[-1].split('.')[0]


def extract_table_data(driver, event_id, episode, xpath, col_limit):
    """Extract the rows of one HTML table as lists prefixed with the event keys.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        event_id: Value stored in the first column of every returned row.
        episode: Value stored in the second column of every returned row.
        xpath: XPath of the table body to read. The alert-summary XPath
            ``//*[@id="alert_summary_left"]/table/tbody`` is handled specially.
        col_limit: For the alert-summary table, values are taken from rows
            ``1 .. col_limit - 1``. For every other table it is the maximum
            number of ``<td>`` cells kept per row; ``0`` keeps all cells and
            also includes ``<th>`` cells.

    Returns:
        A list of rows. For the alert-summary table this is a single row
        ``[event_id, episode, <second cell of each selected row>...,
        <text of the first "p_summary" element>]``. For other tables it is one
        ``[event_id, episode, <cell values>...]`` list per non-empty row.
        An empty list is returned (and a message printed) when the table, or
        the summary paragraph, cannot be found.
    """
    table_data = []
    if xpath == '//*[@id="alert_summary_left"]/table/tbody':
        try:
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.XPATH, './/tr')
            event_summary_text = driver.find_element(By.XPATH, '//*[@class="p_summary"][1]').text

            # Keep only rows that actually contain <td> cells.
            populated_rows = []
            for row in rows:
                cell_texts = [cell.text for cell in row.find_elements(By.XPATH, './/td')]
                if cell_texts:
                    populated_rows.append(cell_texts)

            # Row 0 is skipped; each remaining row contributes its second cell.
            # Missing rows or single-cell rows contribute "" so the result
            # keeps a fixed width instead of raising IndexError.
            values = []
            for index in range(1, col_limit):
                cells = populated_rows[index] if index < len(populated_rows) else []
                values.append(cells[1] if len(cells) > 1 else "")

            table_data = [[event_id, episode, *values, event_summary_text]]
        except NoSuchElementException:
            print(f"Event summary not founds or table with XPath '{xpath}' not found.")
        return table_data

    try:
        table = driver.find_element(By.XPATH, xpath)
        rows = table.find_elements(By.XPATH, './/tr')
        # With a column limit only data cells are read; without one, header
        # cells are included as well.
        cell_xpath = './/td' if col_limit else './/th | .//td'
        for row in rows:
            row_data = [_cell_value(cell) for cell in row.find_elements(By.XPATH, cell_xpath)]
            if row_data:
                kept = row_data[:col_limit] if col_limit else row_data
                table_data.append([event_id, episode] + kept)
    except NoSuchElementException:
        print(f"Table with XPath '{xpath}' not found.")
    return table_data


# Main function
def wildfire_csv(html_file, destination_folder='D:/Web Scraping/Web-Scraping/CSV/test/'):
    """Scrape the tables of one saved event page and append them to CSV files.

    The page is opened in Chrome, each configured table is extracted with
    ``extract_table_data``, and table ``i`` (0-based position in the
    configuration list) is written to ``<destination_folder>/<i>.csv`` using
    the module-level header list ``table_<i+1>_headers``. A CSV that already
    exists is appended to without repeating the header row.

    Args:
        html_file: URL or path of the page to load. The last path component,
            up to its first ".", must start with
            ``<event_id>_<episode>_<event_type>``.
        destination_folder: Folder that receives the CSV files. Defaults to
            the location that was previously hard-coded.

    Raises:
        ValueError: If the file name has fewer than three "_"-separated parts.
        KeyError: If a required ``table_<n>_headers`` list is not defined.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # Define table configurations (XPaths and column limits)
    table_configs = [
        ('//*[@id="alert_summary_left"]/table/tbody', 8),
        ('//*[@id="ctl00_CPH_GridViewEpisodes"]/tbody', 10),
        ('//*[@id="graph_eq"]/table/tbody/tr/td/table/tbody', 0),
        ('//*[@id="ctl00_CPH_GridShakemaps"]/tbody', 10),
        ('//*[@id="provinces"]/table/tbody', 4),
        ('//*[@id="cities"]/table/tbody', 7),
        ('//*[@id="airports"]/table/tbody', 9),
        ('//*[@id="ports"]/table/tbody', 5),
        ('//*[@id="dams"]/table/tbody', 6),
        ('//*[@id="nuclear"]/table/tbody', 5)
    ]

    # Initialize the WebDriver; always shut the browser down, even when
    # loading or extraction fails, so Chrome processes do not pile up while
    # looping over many files.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)
        tables_data = [
            extract_table_data(driver, event_id, episode, xpath, col_limit)
            for xpath, col_limit in table_configs
        ]
    finally:
        driver.quit()

    for row in tables_data:
        print(row)

    # Generate CSV files
    for i, table_data in enumerate(tables_data):
        csv_file_path = os.path.join(destination_folder, f'{i}.csv')
        file_exists = os.path.exists(csv_file_path)
        # Header lists are numbered from 1 for the first configured table.
        # A plain global lookup replaces eval() on a formatted string.
        header = globals()[f"table_{i+1}_headers"]
        print(i, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")


In [19]:
def _cell_value(cell):
    """Return a table cell's stripped text, falling back to its first image.

    For a cell without text the first ``<img>`` inside it is used: its
    ``title`` attribute if set, otherwise the file name of its ``src`` URL
    without the extension (e.g. ".../8p.png" -> "8p"). A cell with neither
    text nor image yields an empty string.
    """
    text = cell.text.strip()
    if text:
        return text

    # find_elements returns an empty list when nothing matches, so a single
    # query answers both "is there an image?" and "which one?".
    images = cell.find_elements(By.TAG_NAME, 'img')
    if not images:
        return ""

    image = images[0]
    title = image.get_attribute('title')
    if title:
        return title

    # get_attribute returns None for a missing attribute; treat that as an
    # empty URL instead of failing on None.split.
    source = image.get_attribute('src') or ""
    return source.split('/')[-1].split('.')[0]


def extract_table_data(driver, event_id, episode, xpath, col_limit):
    """Extract the rows of one HTML table as lists prefixed with the event keys.

    NOTE(review): this notebook also defines ``extract_table_data`` in the
    preceding cell. When the cells run in order, this later definition is the
    one in effect; keep the two in sync or remove one of them.

    Args:
        driver: Selenium WebDriver with the page already loaded.
        event_id: Value stored in the first column of every returned row.
        episode: Value stored in the second column of every returned row.
        xpath: XPath of the table body to read. The alert-summary XPath
            ``//*[@id="alert_summary_left"]/table/tbody`` is handled specially.
        col_limit: For the alert-summary table, values are taken from rows
            ``1 .. col_limit - 1``. For every other table it is the maximum
            number of ``<td>`` cells kept per row; ``0`` keeps all cells and
            also includes ``<th>`` cells.

    Returns:
        A list of rows. For the alert-summary table this is a single row
        ``[event_id, episode, <second cell of each selected row>...,
        <text of the first "p_summary" element>]``. For other tables it is one
        ``[event_id, episode, <cell values>...]`` list per non-empty row.
        An empty list is returned (and a message printed) when the table, or
        the summary paragraph, cannot be found.
    """
    table_data = []
    if xpath == '//*[@id="alert_summary_left"]/table/tbody':
        try:
            table = driver.find_element(By.XPATH, xpath)
            rows = table.find_elements(By.XPATH, './/tr')
            event_summary_text = driver.find_element(By.XPATH, '//*[@class="p_summary"][1]').text

            # Keep only rows that actually contain <td> cells.
            populated_rows = []
            for row in rows:
                cell_texts = [cell.text for cell in row.find_elements(By.XPATH, './/td')]
                if cell_texts:
                    populated_rows.append(cell_texts)

            # Row 0 is skipped; each remaining row contributes its second cell.
            # Missing rows or single-cell rows contribute "" so the result
            # keeps a fixed width instead of raising IndexError.
            values = []
            for index in range(1, col_limit):
                cells = populated_rows[index] if index < len(populated_rows) else []
                values.append(cells[1] if len(cells) > 1 else "")

            table_data = [[event_id, episode, *values, event_summary_text]]
        except NoSuchElementException:
            print(f"Event summary not founds or table with XPath '{xpath}' not found.")
        return table_data

    try:
        table = driver.find_element(By.XPATH, xpath)
        rows = table.find_elements(By.XPATH, './/tr')
        # With a column limit only data cells are read; without one, header
        # cells are included as well.
        cell_xpath = './/td' if col_limit else './/th | .//td'
        for row in rows:
            row_data = [_cell_value(cell) for cell in row.find_elements(By.XPATH, cell_xpath)]
            if row_data:
                kept = row_data[:col_limit] if col_limit else row_data
                table_data.append([event_id, episode] + kept)
    except NoSuchElementException:
        print(f"Table with XPath '{xpath}' not found.")
    return table_data

In [22]:
def list_filenames(folder_path):
    """Return the names of the entries in ``folder_path``.

    The names come straight from ``os.listdir``: they are bare names (not
    full paths), in arbitrary order, and include sub-directories as well as
    files.

    Args:
        folder_path: Path of the folder to list.

    Returns:
        A list of entry names. If the folder does not exist or cannot be
        read, a message is printed and an empty list is returned, so callers
        can always iterate over the result.
    """
    try:
        return os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Previously the error paths fell through and returned None, which made
    # `for name in list_filenames(...)` raise TypeError.
    return []

# Example usage:
folder_path = "F:/Web Scraping/latest_htmls_old/Earthquakes"  # Replace with the actual folder path
filenames = list_filenames(folder_path)

# Build each file:// URL from folder_path so that changing the folder above
# also changes which files are opened (the path used to be repeated here).
# `or []` skips the loop when the folder could not be listed.
for file in filenames or []:
    wildfire_csv(f"file:///{folder_path}/{file}")

1446590 1594407 Earthquakes
[['1446590', '1594407', '17.925 Km', '37.1736 , 37.032', '06 Feb 2023 01:17 UTC\n06 Feb 2023 04:17 Local', 'Less than 100 people in MMI X\n9.2 million (in MMI>=VII)', 'NEIC us6000jllz', '06 Feb 2023 01:40 UTC', '3.2 (Türkiye)', 'This earthquake is expected to have a high humanitarian impact based on the magnitude and the affected population and their vulnerability.']]
[['1446590', '1594407', '1487094', 'EQ', '4', '06 Feb 2023 01:17', '7.8M, 24.075km', '8.3', '3.7 million (in MMI>=VII)', '', '00:12', 'NEIC'], ['1446590', '1594407', '1487096', 'EQ', '4.8', '06 Feb 2023 01:17', '7.8M, 17.925km', '9.56', '8.4 million (in MMI>=VII)', '', '00:22', 'NEIC']]
[['1446590', '1594407', 'Intensity', 'Population'], ['1446590', '1594407', '10p', '<1000 people'], ['1446590', '1594407', '9p', '640000 people'], ['1446590', '1594407', '8p', '1 million people'], ['1446590', '1594407', '7p', '6.7 million people'], ['1446590', '1594407', '6p', '12.4 million people'], ['1446590', 