In [23]:
# Libraries
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [24]:
# Chrome launch configuration shared by every WebDriver created in this notebook.
# Both switches are passed to Chrome unchanged via Options.add_argument.
chrome_options = Options()
for flag in ("--headless", "--no-sandbox"):
    chrome_options.add_argument(flag)

In [25]:
# Column headers for the wildfire CSV outputs.
# Every list starts with the two identifying columns (Event_id, Episode);
# the remaining names label the cells scraped from one section of the page.

# Event index (not referenced by the scraping code visible in this notebook).
table_0_headers = ["Event_id", "Episode", "Event_type", "Impact_url"]

# Alert summary block plus the free-text event summary.
table_1_headers = [
    "Event_id", "Episode",
    "Countries", "Start_date_last_detected", "Duration",
    "People_affected", "Burned_area", "Event_summary",
]

# Impact timeline (episodes grid).
table_2_headers = [
    "Event_id", "Episode",
    "ID", "Alert_Color", "GDACS_Score", "Population_Affected",
    "Burned_Area", "Last_Update", "GWIS",
]

# Exposed population.
table_3_headers = ["Event_id", "Episode", "Radius", "Population"]

# Affected provinces.
table_4_headers = ["Event_id", "Episode", "Region_province", "Country", "Population"]

# Affected populated places.
table_5_headers = [
    "Event_id", "Episode",
    "Name", "Region_Province", "Country", "City_class", "Population", "Distance",
]

# Airports.
table_6_headers = [
    "Event_id", "Episode",
    "Name", "IATA_Code", "Elevation_in_m", "Usage", "Runway_type",
    "IFR", "Runway_Length_in_ft", "Distance",
]

# Ports.
table_7_headers = ["Event_id", "Episode", "Name", "LOCODE", "Country", "Distance"]

# Dams.
table_8_headers = ["Event_id", "Episode", "Reservoir", "Dam_Name", "River", "Year", "Distance"]

# Nuclear plants.
table_9_headers = ["Event_id", "Episode", "Name", "Country", "Reactor", "Distance"]


In [26]:
def wildfire_csv(html_file, destination_folder=None):
    """Scrape one saved wildfire report page and append its tables to CSV files.

    The event id, episode and event type are taken from the first three
    underscore-separated parts of the file name (without extension). Nine
    tables are read from the page with headless Chrome and each one is
    written to ``<destination_folder>/<n>.csv`` (n = 1..9): the file is
    created with a header row when it does not exist, otherwise the rows are
    appended without a header.

    Parameters
    ----------
    html_file : str
        URL of the page to load (e.g. a ``file:///`` URL). The part after the
        last ``/`` must look like ``<event_id>_<episode>_<event_type>...``.
    destination_folder : str, optional
        Folder receiving ``1.csv`` .. ``9.csv``. Defaults to the folder that
        was previously hard-coded in this function.

    Raises
    ------
    ValueError
        If the file name has fewer than three underscore-separated parts.

    Notes
    -----
    A section missing from the page is reported with a printed message and
    produces an empty (header-only on first write) CSV contribution.
    """
    file_name = html_file.split('/')[-1].split('.')[0]
    event_id, episode, event_type = file_name.split('_')[0:3]

    print(event_id, episode, event_type)

    if destination_folder is None:
        # Same value as the original literal, written with valid escapes
        # (the original '\W' and '\C' triggered an invalid-escape SyntaxWarning).
        destination_folder = 'D:\\Web Scraping\\Web-Scraping\\CSV\\Wildfires\\'

    # Explicit mapping instead of eval(f"table_{i}_headers").
    headers_by_table = {
        1: table_1_headers,
        2: table_2_headers,
        3: table_3_headers,
        4: table_4_headers,
        5: table_5_headers,
        6: table_6_headers,
        7: table_7_headers,
        8: table_8_headers,
        9: table_9_headers,
    }

    # (table number, table XPath, cell XPath, strip cell text?)
    # Tables 2-4 keep the raw cell text and tables 5-9 strip it, as before.
    repeating_tables = [
        (2, '//*[@id="ctl00_CPH_GridViewEpisodes"]/tbody', './/td', False),         # impact timeline
        (3, '//*[@id="graph_eq"]/table/tbody/tr/td/table/tbody', './/th | .//td', False),  # exposed population
        (4, '//*[@id="provinces"]/table/tbody', './/td', False),                    # affected provinces
        (5, '//*[@id="cities"]/table/tbody', './/td', True),                        # affected populated places
        (6, '//*[@id="airports"]/table/tbody', './/td', True),                      # airports
        (7, '//*[@id="ports"]/table/tbody', './/td', True),                         # ports
        (8, '//*[@id="dams"]/table/tbody', './/td', True),                          # dams
        (9, '//*[@id="nuclear"]/table/tbody', './/td', True),                       # nuclear plants
    ]

    rows_by_table = {i: [] for i in range(1, 10)}
    # Default so a missing summary paragraph no longer causes a NameError below.
    event_summary_text = ''

    # Initialize the WebDriver; the finally clause guarantees Chrome is shut
    # down even when scraping raises.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)

        event_path = '//*[@class="p_summary"][1]'
        try:
            event_summary_text = driver.find_element(By.XPATH, event_path).text
        except NoSuchElementException:
            print(f"Table with the provided XPath '{event_path}' not found.")

        # Table 1: one row per page, built from the second cell of rows 1..5
        # of the alert summary table, followed by the event summary text.
        table_xpath = '//*[@id="alert_summary_left"]/table/tbody'
        try:
            summary_rows = _wildfire_table_rows(driver, table_xpath)
            rows_by_table[1].append([
                event_id,
                episode,
                *[summary_rows[i][1] for i in range(1, 6)],
                event_summary_text,
            ])
        except NoSuchElementException:
            print(f"Table with the provided XPath '{table_xpath}' not found.")
        except IndexError:
            # Fewer rows/cells than expected: skip instead of aborting the run.
            print(f"Table with the provided XPath '{table_xpath}' has an unexpected layout; skipped.")

        # Tables 2-9: one output row per non-empty page row.
        for number, table_xpath, cell_xpath, strip in repeating_tables:
            # Width is derived from the header so rows can never be wider than
            # the DataFrame columns (the nuclear table previously took 5 cells
            # for a header with room for 4).
            width = len(headers_by_table[number]) - 2
            try:
                page_rows = _wildfire_table_rows(driver, table_xpath, cell_xpath, strip)
            except NoSuchElementException:
                print(f"Table with the provided XPath '{table_xpath}' not found.")
                continue
            for cells in page_rows:
                rows_by_table[number].append(
                    [event_id, episode, *_wildfire_fit_width(cells, width)]
                )
    finally:
        driver.quit()

    # Generate CSV Files
    for i in range(1, 10):
        df = pd.DataFrame(rows_by_table[i], columns=headers_by_table[i])
        csv_file_path = os.path.join(destination_folder, f'{i}.csv')
        # Append without a header when the file already exists.
        file_exists = os.path.exists(csv_file_path)
        df.to_csv(csv_file_path, mode='a' if file_exists else 'w', header=not file_exists, index=False)
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")


def _wildfire_table_rows(driver, table_xpath, cell_xpath='.//td', strip=False):
    """Return the cell texts of every row that has at least one matching cell.

    Raises NoSuchElementException (from ``driver.find_element``) when no
    element matches ``table_xpath``.
    """
    table = driver.find_element(By.XPATH, table_xpath)
    collected = []
    for tr in table.find_elements(By.XPATH, './/tr'):
        texts = [cell.text for cell in tr.find_elements(By.XPATH, cell_xpath)]
        if strip:
            texts = [text.strip() for text in texts]
        if texts:
            collected.append(texts)
    return collected


def _wildfire_fit_width(cells, width):
    """Truncate ``cells`` to ``width`` items, padding short rows with None."""
    fitted = list(cells[:width])
    fitted.extend([None] * (width - len(fitted)))
    return fitted
        

  destination_folder = 'D:\Web Scraping\Web-Scraping\CSV\Wildfires\\'


In [27]:
def list_filenames(folder_path):
    """Return the names of all entries in ``folder_path``.

    The names are returned as given by ``os.listdir`` (bare names, no
    directory prefix, sub-directories included, order unspecified).

    Parameters
    ----------
    folder_path : str
        Directory to list.

    Returns
    -------
    list of str
        Entry names, or an empty list when the folder cannot be listed; a
        message describing the problem is printed in that case. Returning a
        list on every path keeps ``for name in list_filenames(...)`` safe
        (the error paths used to fall through and return None).
    """
    try:
        return os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return []

# Example usage: scrape every saved page found in the input folder.
folder_path = "F:/Web Scraping/latest_htmls/Wildfires/"  # Replace with the actual folder path
filenames = list_filenames(folder_path)

# "or []" makes a missing listing (None or empty) a no-op instead of a TypeError.
for file in filenames or []:
    # Build the URL from folder_path so the folder that is listed and the
    # folder that is opened cannot drift apart.
    wildfire_csv(f"file:///{folder_path.rstrip('/')}/{file}")

1022234 9 Wildfires
Table with the provided XPath '//*[@id="cities"]/table/tbody' not found.
Table with the provided XPath '//*[@id="airports"]/table/tbody' not found.
Table with the provided XPath '//*[@id="ports"]/table/tbody' not found.
Table with the provided XPath '//*[@id="dams"]/table/tbody' not found.
Table with the provided XPath '//*[@id="nuclear"]/table/tbody' not found.
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\1.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\2.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\3.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\4.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\5.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\6.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\7.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\8.csv
Data written to D:\Web Scraping\Web-Scraping\CSV\Wildfires\9.csv
1022268 10 Wildfires
Table wit