In [1]:
# Libraries
from pathlib import Path
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os

In [2]:
# Set up Chrome options
chrome_options = Options()
# Chrome command-line switches are prefixed with two ASCII hyphens. The
# original string started with an en dash ("–"), which is not a switch prefix,
# so the extensions flag was never actually applied.
chrome_options.add_argument("--disable-extensions")

In [None]:
# Column headers for the four extracted tables. create_csv pairs
# table_N_headers with the N-th table it extracts, whatever the event type.
# Every table starts with the Event_id / Episode key columns.
table_1_headers = [
    'Event_id',
    'Episode',
    'Countries',
    'Start_date',
    'Duration',
    'Impact',
    'Event_summary',
]
table_2_headers = [
    'Event_id',
    'Episode',
    'Id',
    'Alert',
    'Score',
    'Date',
    'Description_impact',
    'Source',
]
table_3_headers = [
    'Event_id',
    'Episode',
    'Country',
]
table_4_headers = [
    'Event_id',
    'Episode',
    'Agri',
    'Name',
    'Country',
]


# Main function
def create_csv(html_file):
    """Extract the four event tables from one saved page and append them to CSV files.

    Args:
        html_file: URL handed to ``driver.get``. The last ``/``-separated
            segment, up to its first ``.``, must start with
            ``<event_id>_<episode>_<event_type>``.

    Raises:
        ValueError: if the file name has fewer than three ``_``-separated parts.

    Side effects:
        Starts a Chrome session (always closed again, even on failure), prints
        progress, and writes or appends ``CSV/<event_type>_<n>.csv`` for
        n = 1..4, relative to the current working directory.
    """
    # File and event details
    file_name = html_file.split('/')[-1].split('.')[0]
    name_parts = file_name.split('_')
    if len(name_parts) < 3:
        raise ValueError(
            f"Expected '<event_id>_<episode>_<event_type>' in file name, got '{file_name}'"
        )
    event_id, episode, event_type = name_parts[:3]
    print(event_id, episode, event_type)

    # One entry per table: (XPath, column limit, CSV header). Keeping the header
    # next to its XPath replaces the former eval()-based lookup by name.
    table_configs = [
        ('//*[@id="alert_summary_left"]/table/tbody', 5, table_1_headers),
        ('//*[@id="ctl00_CPH_GridViewEpisodes"]/tbody', 6, table_2_headers),
        ('//*[@id="countries"]/table/tbody', 1, table_3_headers),
        ('//*[@id="aru"]/table/tbody', 3, table_4_headers),
    ]

    # Initialize the WebDriver; the finally clause guarantees the browser is
    # shut down even when loading or extraction raises.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(html_file)
        tables_data = [
            extract_table_data(driver, event_id, episode, xpath, col_limit)
            for xpath, col_limit, _header in table_configs
        ]
    finally:
        driver.quit()

    for table_data in tables_data:
        print(table_data)

    # Define CSV destination and make sure it exists before writing.
    destination_folder = Path('CSV').resolve()
    destination_folder.mkdir(parents=True, exist_ok=True)

    # Generate CSV files
    for i, (table_data, (_xpath, _limit, header)) in enumerate(
        zip(tables_data, table_configs), start=1
    ):
        # Path joining picks the right separator on every OS; the old
        # hard-coded backslash only produced a nested path on Windows.
        csv_file_path = destination_folder / f'{event_type}_{i}.csv'
        file_exists = csv_file_path.exists()
        print(i, header)
        print(table_data)
        df = pd.DataFrame(table_data, columns=header)
        # Append without repeating the header when the file is already there.
        df.to_csv(
            csv_file_path,
            mode='a' if file_exists else 'w',
            header=not file_exists,
            index=False,
        )
        print(f"Data {'appended to' if file_exists else 'written to'} {csv_file_path}")


In [4]:

# Function to extract table data based on XPath and the number of columns required
def extract_table_data(driver, event_id, episode, xpath, col_limit):
    """Read one table from the loaded page as a list of rows.

    Args:
        driver: Selenium driver (or element-like object) with the page loaded.
        event_id: Value placed in the first column of every returned row.
        episode: Value placed in the second column of every returned row.
        xpath: XPath of the table body to read.
        col_limit: For the two summary XPaths, rows 1..col_limit-1 contribute
            their second cell. For any other table it is the maximum number of
            cells kept per row; a falsy value keeps every cell and also reads
            ``th`` cells.

    Returns:
        A list of rows, each starting with ``[event_id, episode, ...]``.
        An empty list when the table (or the event summary) is not on the page.
    """
    summary_xpaths = (
        '//*[@id="alert_summary_left"]/table/tbody',
        '//*[@id="tab_responsive"]/table/tbody',
    )
    if xpath in summary_xpaths:
        return _extract_summary_row(driver, event_id, episode, xpath, col_limit)
    return _extract_grid_rows(driver, event_id, episode, xpath, col_limit)


def _extract_summary_row(driver, event_id, episode, xpath, col_limit):
    """Collapse the key/value summary table plus the summary paragraph into one row."""
    try:
        table = driver.find_element(By.XPATH, xpath)
        rows = table.find_elements(By.XPATH, './/tr')
        event_summary_text = driver.find_element(By.XPATH, '//*[@class="p_summary"][1]').text
    except NoSuchElementException:
        print(f"Event summary not founds or table with XPath '{xpath}' not found.")
        return []

    # Keep only rows that actually contain <td> cells.
    value_rows = []
    for row in rows:
        cells = [cell.text for cell in row.find_elements(By.XPATH, './/td')]
        if cells:
            value_rows.append(cells)

    # Row 0 is skipped; each following row contributes its second cell. Missing
    # rows or cells become "" so the result keeps the expected column count
    # instead of raising IndexError on a shorter-than-usual table.
    values = []
    for index in range(1, col_limit):
        has_value = index < len(value_rows) and len(value_rows[index]) > 1
        values.append(value_rows[index][1] if has_value else "")

    return [[event_id, episode, *values, event_summary_text]]


def _extract_grid_rows(driver, event_id, episode, xpath, col_limit):
    """Return every non-empty row of a regular table, truncated to col_limit cells."""
    try:
        table = driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        print(f"Table with XPath '{xpath}' not found.")
        return []

    cell_xpath = './/td' if col_limit else './/th | .//td'
    keep = col_limit or None  # a falsy limit keeps the whole row

    table_data = []
    for row in table.find_elements(By.XPATH, './/tr'):
        row_data = [_cell_value(cell) for cell in row.find_elements(By.XPATH, cell_xpath)]
        if row_data:
            table_data.append([event_id, episode] + row_data[:keep])
    return table_data


def _cell_value(cell):
    """Return the cell's stripped text, or a label taken from its first image when blank."""
    cell_text = cell.text.strip()
    if cell_text:
        return cell_text

    images = cell.find_elements(By.TAG_NAME, 'img')
    if not images:
        return ""  # Leave it blank if no image is found

    image = images[0]
    # Preference order: title attribute, then alt text, then the URL file stem.
    label = image.get_attribute('title') or image.get_attribute('alt')
    if label:
        return label
    # get_attribute returns None for a missing src; treat that as an empty URL.
    image_url = image.get_attribute('src') or ""
    return image_url.split('/')[-1].split('.')[0]  # e.g. "8p" from ".../8p.png"

In [5]:
def list_filenames(folder_path):
    """Return the names of all entries in ``folder_path``.

    Args:
        folder_path: Directory to list (anything ``os.listdir`` accepts).

    Returns:
        The list from ``os.listdir``. If the folder is missing or cannot be
        read, the problem is printed and an empty list is returned, so callers
        can always iterate over the result.
    """
    try:
        # Get a list of all files in the specified folder
        return os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
    # Reached only after a handled error; previously this fell through to None.
    return []

def main(folder_path="html_files/Droughts"):
    """Run create_csv on every file in ``folder_path``.

    Args:
        folder_path: Folder holding the saved HTML pages, resolved against the
            current working directory. Defaults to the drought pages.
    """
    absolute_folder_path = Path(folder_path).resolve()
    # list_filenames may yield nothing usable when the folder is missing;
    # treat that as "no files" instead of failing on iteration.
    filenames = list_filenames(absolute_folder_path) or []

    for file in filenames:
        absolute_file_path = absolute_folder_path / file
        # Sub-directories are not pages; skip them.
        if not absolute_file_path.is_file():
            continue
        create_csv(f"file:///{absolute_file_path.as_posix()}")


In [6]:
# Entry point: start the batch conversion when this code runs as the top-level
# program. Running this cell in the notebook also triggers it, as the captured
# output below shows.
if __name__ == "__main__":
    main()

1013868 152 Droughts
[['1013868', '152', 'Argentina', 'End of May 2024', '71 days', 'Minor impact for agricultural drought in 73617 km2', 'No evidence of relevant impacts or not enough information available to date.']]
[['1013868', '152', '1', 'Medium', '1', '06 Apr 2020 15:07', 'Medium impact for agricultural drought in 78691 km2', 'GDO'], ['1013868', '152', '2', 'Medium', '1', '13 Apr 2020 13:27', 'Medium impact for agricultural drought in 511626 km2', 'GDO'], ['1013868', '152', '3', 'Medium', '1', '23 Apr 2020 10:47', 'Medium impact for agricultural drought in 511626 km2', 'GDO'], ['1013868', '152', '4', 'Medium', '1', '10 May 2020 15:47', 'Medium impact for agricultural drought in 509307 km2', 'GDO'], ['1013868', '152', '5', 'Medium', '1', '11 May 2020 14:47', 'Medium impact for agricultural drought in 954036 km2', 'GDO'], ['1013868', '152', '6', 'Medium', '1', '02 Jun 2020 16:37', 'Medium impact for agricultural drought in 1045260 km2', 'GDO'], ['1013868', '152', '7', 'Medium', '1