In [57]:
# Libraries
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [58]:
# Set up the Chrome options shared by every WebDriver session in this notebook
chrome_options = Options()
chrome_options.add_argument("--headless")  # run Chrome without opening a window
chrome_options.add_argument("--no-sandbox")
# Switches must start with two ASCII hyphens; a typographic dash is not
# recognised as a switch prefix and the flag would be silently ignored.
chrome_options.add_argument("--disable-extensions")

In [63]:
# Wildfire column headers.
# Every output table begins with the same two identifying columns.
_ID_COLUMNS = ('Event_id', 'Episode')

# Not referenced by wildfire_csv in this notebook (it starts at table_1_headers).
table_0_headers = [*_ID_COLUMNS, 'Event_type', 'Impact_url']

# Event summary (one row per event/episode).
table_1_headers = [
    *_ID_COLUMNS,
    'Countries',
    'Start_date_last_detected',
    'Duration',
    'People_affected',
    'Burned_area',
    'Event_summary',
]

# Episode grid.
table_2_headers = [
    *_ID_COLUMNS,
    'ID',
    'Alert_Color',
    'GDACS_Score',
    'Population_Affected',
    'Burned_Area',
    'Last_Update',
    'GWIS',
]

# Population by radius.
table_3_headers = [*_ID_COLUMNS, 'Radius', 'Population']

# Affected provinces.
table_4_headers = [*_ID_COLUMNS, 'Region_province', 'Country', 'Population']

# Nearby cities.
table_5_headers = [
    *_ID_COLUMNS,
    'Name',
    'Region_Province',
    'Country',
    'City_class',
    'Population',
    'Distance',
]

# Nearby airports.
table_6_headers = [
    *_ID_COLUMNS,
    'Name',
    'IATA_Code',
    'Elevation_in_m',
    'Usage',
    'Runway_type',
    'IFR',
    'Runway_Length_in_ft',
    'Distance',
]

# Nearby ports.
table_7_headers = [*_ID_COLUMNS, 'Name', 'LOCODE', 'Country', 'Distance']

# Nearby dams.
table_8_headers = [*_ID_COLUMNS, 'Reservoir', 'Dam_Name', 'River', 'Year', 'Distance']

# Nearby nuclear sites.
table_9_headers = [*_ID_COLUMNS, 'Name', 'Country', 'Reactor', 'Distance']
# XPath of the event summary table. Its values are read from the second cell
# of each row, so it is handled separately from the regular grid tables.
_SUMMARY_TABLE_XPATH = '//*[@id="alert_summary_left"]/table/tbody'


def _extract_summary_row(driver, event_id, episode, xpath, col_limit):
    """Build the single summary record from the summary table and summary text."""
    table_data = []
    try:
        table = driver.find_element(By.XPATH, xpath)
        rows = table.find_elements(By.XPATH, './/tr')
        event_summary_text = driver.find_element(By.XPATH, '//*[@class="p_summary"][1]').text

        # Keep only rows that contain at least one <td> cell.
        cell_rows = []
        for row in rows:
            cells = [cell.text for cell in row.find_elements(By.XPATH, './/td')]
            if cells:
                cell_rows.append(cells)

        # The first kept row is skipped; the value of each following row is
        # taken from its second cell.
        values = [cell_rows[i][1] for i in range(1, col_limit)]
        table_data = [[event_id, episode, *values, event_summary_text]]
    except NoSuchElementException:
        print(f"Event summary not founds or table with XPath '{xpath}' not found.")
    except IndexError:
        # A truncated or differently shaped summary table should not abort a
        # whole batch run; report it and return no rows, as for a missing table.
        print(f"Summary table with XPath '{xpath}' has fewer rows or cells than expected; skipped.")
    return table_data


def _extract_grid_rows(driver, event_id, episode, xpath, col_limit):
    """Return one record per non-empty row of a regular grid table."""
    table_data = []
    # With a column limit only <td> cells are read; without one, header cells
    # (<th>) are read as well and every cell of the row is kept.
    cell_xpath = './/td' if col_limit else './/th | .//td'
    try:
        table = driver.find_element(By.XPATH, xpath)
        for row in table.find_elements(By.XPATH, './/tr'):
            row_data = [cell.text.strip() for cell in row.find_elements(By.XPATH, cell_xpath)]
            if not row_data:
                continue
            kept = row_data[:col_limit] if col_limit else row_data
            table_data.append([event_id, episode] + kept)
    except NoSuchElementException:
        print(f"Table with XPath '{xpath}' not found.")
    return table_data


# Function to extract table data based on XPath and the number of columns required
def extract_table_data(driver, event_id, episode, xpath, col_limit):
    """Extract the rows of one table from the page currently loaded in ``driver``.

    Parameters
    ----------
    driver :
        Object exposing ``find_element(by, value)``; in this notebook a
        Selenium WebDriver.
    event_id, episode :
        Values placed at the start of every returned row.
    xpath : str
        XPath of the table body to read. The summary table XPath
        (``//*[@id="alert_summary_left"]/table/tbody``) selects the summary
        layout; any other value is read as a regular grid.
    col_limit : int
        Grid tables: maximum number of cells kept per row; ``0`` keeps all
        cells, including ``<th>`` cells.
        Summary table: values are taken from kept rows ``1 .. col_limit - 1``.

    Returns
    -------
    list[list]
        Rows of ``[event_id, episode, *cells]``. The summary table yields at
        most one row, ending with the text of the first ``p_summary`` element.
        An empty list is returned, after printing a message, when the table
        (or the summary text) is missing or the summary table is too short.
    """
    if xpath == _SUMMARY_TABLE_XPATH:
        return _extract_summary_row(driver, event_id, episode, xpath, col_limit)
    return _extract_grid_rows(driver, event_id, episode, xpath, col_limit)

# Main function
def wildfire_csv(html_file, destination_folder='D:/Web Scraping/Web-Scraping/CSV/test/'):
    """Scrape one saved wildfire event page and append its tables to CSV files.

    Parameters
    ----------
    html_file : str
        Location passed to ``driver.get``. Its last ``/``-separated component,
        cut at the first ``.``, must start with
        ``<event_id>_<episode>_<event_type>``.
    destination_folder : str, optional
        Output folder prefix. It is concatenated directly with the file name,
        so it must end with a path separator. Created if it does not exist.

    Raises
    ------
    ValueError
        If the file name has fewer than three ``_``-separated parts.

    Notes
    -----
    One CSV per configured table is written as ``0.csv`` .. ``8.csv``. Rows
    are appended when the file already exists; otherwise the file is created
    with a header line.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]
    print(event_id, episode, event_type)

    # Table configurations: (XPath, column limit, CSV header).
    # A column limit of 0 keeps every cell of a row.
    table_configs = [
        ('//*[@id="alert_summary_left"]/table/tbody', 6, table_1_headers),
        ('//*[@id="ctl00_CPH_GridViewEpisodes"]/tbody', 7, table_2_headers),
        ('//*[@id="graph_eq"]/table/tbody/tr/td/table/tbody', 0, table_3_headers),
        ('//*[@id="provinces"]/table/tbody', 3, table_4_headers),
        ('//*[@id="cities"]/table/tbody', 6, table_5_headers),
        ('//*[@id="airports"]/table/tbody', 8, table_6_headers),
        ('//*[@id="ports"]/table/tbody', 4, table_7_headers),
        ('//*[@id="dams"]/table/tbody', 5, table_8_headers),
        ('//*[@id="nuclear"]/table/tbody', 4, table_9_headers),
    ]

    # Initialize the WebDriver; always shut the browser down, even when loading
    # or extraction fails, so a batch run does not leave Chrome processes behind.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)
        tables_data = [
            extract_table_data(driver, event_id, episode, xpath, col_limit)
            for xpath, col_limit, _ in table_configs
        ]
    finally:
        driver.quit()

    for row in tables_data:
        print(row)

    # Make sure the CSV destination exists before writing.
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)

    # Generate CSV files
    for i, (table_data, (_, _, header)) in enumerate(zip(tables_data, table_configs)):
        csv_file_path = f'{destination_folder}{i}.csv'
        file_exists = os.path.exists(csv_file_path)
        print(i, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        # Append without repeating the header when the file is already there.
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")


In [64]:
def list_filenames(folder_path):
    """Return the names of the entries in ``folder_path``.

    The names come straight from ``os.listdir``: they are bare names (no
    directory prefix), in arbitrary order, and include sub-directories.

    Parameters
    ----------
    folder_path : str
        Folder to list.

    Returns
    -------
    list[str]
        Entry names, or an empty list if the folder could not be listed. A
        message is printed in that case, so callers can iterate over the
        result without checking for ``None``.
    """
    try:
        # Get a list of all entries in the specified folder
        return os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Reached only after a failure above.
    return []

# Example usage:
# folder_path must end with "/" because file names are appended to it directly.
folder_path = "F:/Web Scraping/latest_htmls/Wildfires/"  # Replace with the actual folder path
filenames = list_filenames(folder_path)

# `or []` keeps the loop a no-op when the listing failed and produced nothing.
for file in filenames or []:
    # Build the file:// URL from folder_path so the folder is set in one place.
    wildfire_csv(f"file:///{folder_path}{file}")

1004452 10 Wildfires
[['1004452', '10', 'Republic of Korea', '04 Mar 2022 - 12 Mar 2022', '8', '1580 in the burned area', '19748 ha', 'This forest fire is expected to have a low humanitarian impact based on the magnitude and the affected population and their vulnerability.']]
[['1004452', '10', '1', '', '0.5', '1044', '7992', '05 Mar 2022 04:01', 'GWIS'], ['1004452', '10', '2', '', '0.5', '1044', '8053', '05 Mar 2022 11:37', 'GWIS'], ['1004452', '10', '3', '', '0.5', '1539', '12682', '06 Mar 2022 04:01', 'GWIS'], ['1004452', '10', '4', '', '0.5', '1570', '15222', '07 Mar 2022 04:01', 'GWIS'], ['1004452', '10', '5', '', '0.5', '1580', '17363', '08 Mar 2022 04:01', 'GWIS'], ['1004452', '10', '6', '', '0.5', '1570', '18099', '09 Mar 2022 04:01', 'GWIS'], ['1004452', '10', '7', '', '0.5', '1580', '18435', '10 Mar 2022 04:00', 'GWIS'], ['1004452', '10', '8', '', '0.5', '1580', '19387', '11 Mar 2022 04:00', 'GWIS'], ['1004452', '10', '9', '', '0.5', '1633', '19728', '12 Mar 2022 04:00', 'GWI