In [1]:
import time
import requests
import numpy as np
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


In [58]:
ESWD_URL = 'https://eswd.eu/cgi-bin/eswd.cgi'
ESWD_TORNADO_XPATH = '//*[@name="TORNADO"]'
# YEARS_TO_COUNT = ['1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']
YEARS_TO_COUNT = ['2015']
# MONTHS_TO_COUNT = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
MONTHS_TO_COUNT = ['03']
START_DATE_XPATH = '//*[@id="start_date"]'
END_DATE_XPATH = '//*[@id="end_date"]'
FIND_REPORTS_COUNT_XPATH = "//p[contains(text(),'selected reports')] | //p[contains(text(),'no reports')]"
SUBMIT_XPATH = '(//*[@value="submit query"])[2]'

def create_web_driver():
    return webdriver.Chrome(ChromeDriverManager().install())

def start_web_driver(driver):
    driver.maximize_window()
    driver.get(ESWD_URL)
    driver.find_element_by_xpath(ESWD_TORNADO_XPATH).click()

def get_date(curr_year, curr_month):
    if curr_month == '02':
        day = '28'
    elif curr_month == '04' or curr_month == '06' or curr_month == '09' or curr_month == '11':
        day = '30'
    else:
        day = '31'
    return f'{day}-{curr_month}-{curr_year}', f'01-{curr_month}-{curr_year}'

def input_date(driver, start_date, end_date):
    driver.find_element_by_xpath(START_DATE_XPATH).clear()
    time.sleep(0.5)
    driver.find_element_by_xpath(START_DATE_XPATH).send_keys(start_date)
    driver.find_element_by_xpath(END_DATE_XPATH).clear()
    time.sleep(0.5)
    driver.find_element_by_xpath(END_DATE_XPATH).send_keys(end_date)

def close_web_driver(driver):
    driver.close()

def get_data_from_page(driver):
    district, city, country, coordinates, date, times, scale = [], [], [], [], [], [], []
    base_xpath ="//td[@class='base_info']/p[b]"
    district_xpath = "/b[1]"
    # city_xpath = "/text()[2]"
    city_xpath = "/b"
    country_xpath = "/b[2]"
    coordinates_xpath = "/text()[3]"
    date_xpath = "/b[3]"
    time_xpath = "/b[4]"
    scale_xpath_1 = "(//p[@class='TORNADO detail_info_entry'])"
    scale_xpath_2 = "/b[contains(text(),'F')]" 
    scale_xpath_3 = "((//p[@class='TORNADO'][contains(b,'tornado')])"
    scale_xpath_4 = "/b[text()='tornado'])"
    reports_count = len(driver.find_elements_by_xpath(base_xpath))
    for i in range(reports_count):
        district = get_elements_from_list(driver,district_xpath,base_xpath,district,i)
        child_item = driver.find_elements_by_xpath((f'({base_xpath})[{i+1}]{city_xpath}'))
        child_item_text = get_text_from_element_list(child_item)
        parent_item_string= driver.find_elements_by_xpath(base_xpath)[i].text
        city= replace_child_text(child_item_text,parent_item_string,city,0)
        country = get_elements_from_list(driver,country_xpath,base_xpath,country,i)
        coordinates = replace_child_text(child_item_text,parent_item_string,coordinates,1)
        date = get_elements_from_list(driver,date_xpath,base_xpath,date,i)
        times = get_elements_from_list(driver,time_xpath,base_xpath,times,i)
        scale_elements_list = driver.find_elements_by_xpath((f'{scale_xpath_1}[{i+1}]{scale_xpath_2} | {scale_xpath_3}[{i+1}]{scale_xpath_4}'))
        scale = get_scale_list(scale,scale_elements_list)
        # if len(scale_elements_list) == 1:
        #     scale.append('NULL')
        # else:
        #     for element in scale_elements_list:
        #         if 'F' in element.text:
        #             scale.append(element.text)
    print(city)
    print(coordinates)
    print(scale)

def get_scale_list(scale,scale_elements_list):
    if len(scale_elements_list) == 1:
        scale.append('NULL')
    else:
        for element in scale_elements_list:
            if 'F' in element.text:
                scale.append(element.text)
    return scale

def replace_child_text(child_text,field,curr_list,selector):
    for word in child_text.split(' '):
        field = field.replace('\n','').replace(word,'',1).lstrip()
    if field[0] == '(':
        if selector == 0:
            curr_list.append(field.split('(')[selector+1].rstrip().replace(')',''))
        elif selector == 1:
            if '<' in field:
                curr_list.append(field.split('(')[selector+1].split(')')[0].rstrip())
            else:
                curr_list.append(field.split('(')[selector+1].rstrip().replace(')',''))
    else:
        if selector == 0:
            curr_list.append(field.split('(')[selector].rstrip())
        elif selector == 1:
            if '<' in field:
                curr_list.append(field.split('(')[selector].split(')')[0].rstrip())
            else:
                curr_list.append(field.split('(')[selector].rstrip().replace(')',''))
    return curr_list

def get_text_from_element_list(elem_list):
    text = ''
    for elem in elem_list:
        text += elem.text + ' '
    return text

def get_elements_from_list(driver,curr_xpath,base_xpath,curr_list,i):
    curr_list.append(driver.find_elements_by_xpath((base_xpath+curr_xpath))[i].text)
    return curr_list

def log(message):
    with open('log.txt', 'a') as f:
        f.write(message + '\n')

driver = create_web_driver()
start_web_driver(driver)
for year in YEARS_TO_COUNT:
    for month in MONTHS_TO_COUNT:
        end_date, start_date = get_date(year, month)
        input_date(driver, start_date, end_date)
        driver.find_element_by_xpath(SUBMIT_XPATH).click()
        reports_amount = driver.find_element_by_xpath(FIND_REPORTS_COUNT_XPATH).text
        log(f'{year}-{month}-{reports_amount}')
        if "no report" in reports_amount:
            continue
        time.sleep(2)
        get_data_from_page(driver)
        # df.to_csv(f'{year}_{month}.csv')
# time.sleep(2)
# close_web_driver(driver)




Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/elad/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  return webdriver.Chrome(ChromeDriverManager().install())
  driver.find_element_by_xpath(ESWD_TORNADO_XPATH).click()
  driver.find_element_by_xpath(START_DATE_XPATH).clear()
  driver.find_element_by_xpath(START_DATE_XPATH).send_keys(start_date)
  driver.find_element_by_xpath(END_DATE_XPATH).clear()
  driver.find_element_by_xpath(END_DATE_XPATH).send_keys(end_date)
  driver.find_element_by_xpath(SUBMIT_XPATH).click()
  reports_amount = driver.find_element_by_xpath(FIND_REPORTS_COUNT_XPATH).text
  reports_count = len(driver.find_elements_by_xpath(base_xpath))
  curr_list.append(driver.find_elements_by_xpath((base_xpath+curr_xpath))[i].text)
  child_item = driver.find_elements_by_xpath((f'({base_xpath})[{i+1}]{city_xpath}'))
  parent_item_string= driver.find_elements_by_xpath(base_xpath)[i].t

['Hessen', 'Wielkopolskie', 'over the sea. Islas Canarias - Tenerife', 'Tartus', 'Osmaniye Province', 'Antalya Province', 'over the Strait of Gibraltar.', 'over the sea. Hatay Province', 'Sahilkent, Salur and Hacıveliler areas. Antalya Province', 'Falarsaina Chania', 'Haifa / H̱efa', 'over the Adriatic Sea. Abruzzo', 'S of Salerno, over the sea. Campania', 'Muğla Province', 'Παξοί Paxoi', 'over Otterbach village and E of the village. Hessen', 'Enydreio Dodekánisos']
['51.64 N, 9.46 E', '52.23 N, 17.35 E', '28.50 N, 16.45 W', '35.21 N, 35.96 E', '37.48 N, 35.98 E', '36.47 N, 30.12 E', '36.09 N, 5.34 W', '36.62 N, 36.10 E', '36.35 N, 30.22 E', '35.50 N, 23.58 E', '32.81 N, 34.92 E', '42.04 N, 14.76 E', '40.60 N, 14.80 E', '37.41 N, 27.66 E', '39.24 N, 20.09 E', '50.69 N, 9.09 E', '36.46 N, 28.22 E']
['NULL', 'F1', 'NULL', 'F0', 'F1', 'F0', 'NULL', 'NULL', 'F1', 'NULL', 'NULL', 'NULL', 'NULL', 'F1', 'NULL', 'F1', 'NULL']
