In [None]:
import calendar
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
# Global configuration: target URLs, the date range to scrape, and ESWD form selectors.
ESWD_URL = 'https://eswd.eu/cgi-bin/eswd.cgi'
# WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_European_tornadoes_and_tornado_outbreaks'
NOAA_URL = 'https://www.ncdc.noaa.gov/stormevents/choosedates.jsp?statefips=-999,ALL'

# Years to scrape, as strings. The full range is kept below for reference:
# YEARS_TO_COUNT = ['1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']
YEARS_TO_COUNT = ['1994']
# Zero-padded month numbers, '01' through '12'.
MONTHS_TO_COUNT = [f'{month_number:02d}' for month_number in range(1, 13)]

# XPath selectors used on the ESWD query form and its result page.
ESWD_TORNADO_XPATH = '//*[@name="TORNADO"]'
START_DATE_XPATH = '//*[@id="start_date"]'
END_DATE_XPATH = '//*[@id="end_date"]'
SUBMIT_XPATH = '(//*[@value="submit query"])[2]'
FIND_REPORTS_COUNT_XPATH = "//p[contains(text(),'selected reports')] | //p[contains(text(),'no reports')]"

In [None]:
# XPath selectors for the NOAA Storm Events search form.
# Selectors ending in "contains(text(),'" are intentionally left open: the NOAA
# date helpers append the value to match followed by "')]" to complete them.
NOAA_BEGINMONTH_XPATH = "//select[@id='beginDate_mm']//option[contains(text(),'"
NOAA_BEGINYEAR_XPATH = "//select[@id='beginDate_yyyy']//option[contains(text(),'"
# Complete selector: the begin day is always the option whose text contains '01'.
NOAA_BEGINDAY_XPATH = "//select[@id='beginDate_dd']//option[contains(text(),'01')]"
NOAA_ENDMONTH_XPATH = "//select[@id='endDate_mm']//option[contains(text(),'"
NOAA_ENDYEAR_XPATH = "//select[@id='endDate_yyyy']//option[contains(text(),'"
NOAA_ENDDAY_XPATH = "//select[@id='endDate_dd']//option[contains(text(),'"
# Option of the event-type select whose value contains 'Tornado'.
NOAA_EVENTTYPE_XPATH = "//select[@id='eventType']//option[contains(@value,'Tornado')]"
# Button that submits the search form.
NOAA_SEARCH_XPATH = "//input[@value='Search']"
# Element clicked after each search to get back to the search form.
NOAA_RETURN_XPATH = '//*[@id="anch_8"]'


In [None]:
# auxiliary functions for creating, starting and closing the web driver, plus a small file logger
def create_web_driver():
    """Create a Chrome WebDriver backed by the chromedriver that webdriver_manager installs.

    Returns:
        The object returned by ``webdriver.Chrome`` for the installed driver path.
    """
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(chromedriver_path)

def start_web_driver(driver, url):
    """Maximize the browser window of ``driver`` and then load ``url`` in it."""
    driver.maximize_window()
    driver.get(url)

def close_web_driver(driver):
    """Shut the browser session down completely.

    ``driver.quit()`` is used instead of ``driver.close()``: ``close()`` only
    closes the current window and leaves the WebDriver session (and its
    chromedriver process) alive, whereas ``quit()`` closes every window and
    ends the session.
    """
    driver.quit()

def log(message,filename):
    """Append ``message`` followed by a newline to the file named ``filename``.

    The file is opened in append mode, so it is created when missing and
    earlier content is kept.
    """
    target = f'{filename}'
    with open(target, mode='a') as log_file:
        log_file.writelines((message + '\n',))


In [None]:
# NOAA helpers that select the start and end dates of the query in the search form's drop-down lists
def noaa_input_start_date(driver, year, month):
    """Select the begin month, begin year and begin day on the NOAA search form.

    Each option is looked up and clicked twice in a row, in the order
    month, year, day. The day selector is fixed (NOAA_BEGINDAY_XPATH).
    """
    option_xpaths = (
        f"{NOAA_BEGINMONTH_XPATH}{month}')]",
        f"{NOAA_BEGINYEAR_XPATH}{year}')]",
        f"{NOAA_BEGINDAY_XPATH}",
    )
    for option_xpath in option_xpaths:
        # The element is located again before every click, two clicks per option.
        for _ in range(2):
            driver.find_element_by_xpath(option_xpath).click()

def noaa_input_end_date(driver, curr_year, end_month):
    """Select the end month, end year and last day of that month on the NOAA search form.

    The last day is taken from ``calendar.monthrange`` so that February ends
    on the 29th in leap years (a fixed 28 silently dropped events on 29 Feb),
    and so that the month may be given as an int or a numeric string.

    Args:
        driver: WebDriver showing the NOAA search form.
        curr_year: year to select; must be convertible with ``int()``.
        end_month: month number to select; must be convertible with ``int()``.

    Raises:
        ValueError: if ``curr_year``/``end_month`` are not numeric.
        calendar.IllegalMonthError: if the month is outside 1..12.
    """
    end_day = calendar.monthrange(int(curr_year), int(end_month))[1]
    option_xpaths = (
        f"{NOAA_ENDMONTH_XPATH}{end_month}')]",
        f"{NOAA_ENDYEAR_XPATH}{curr_year}')]",
        f"{NOAA_ENDDAY_XPATH}{end_day}')]",
    )
    for option_xpath in option_xpaths:
        # Same interaction as the start-date helper: locate and click twice.
        for _ in range(2):
            driver.find_element_by_xpath(option_xpath).click()
    


In [None]:
# auxiliary functions that click the relevant controls of the NOAA search page
def noaa_start_search(driver):
    """Click the element matched by NOAA_SEARCH_XPATH to submit the search form."""
    search_button = driver.find_element_by_xpath(NOAA_SEARCH_XPATH)
    search_button.click()

def noaa_input_event_type(driver):
    """Click the event-type option matched by NOAA_EVENTTYPE_XPATH."""
    tornado_option = driver.find_element_by_xpath(NOAA_EVENTTYPE_XPATH)
    tornado_option.click()

def noaa_return_to_home(driver):
    """Click the element matched by NOAA_RETURN_XPATH to leave the result page."""
    home_link = driver.find_element_by_xpath(NOAA_RETURN_XPATH)
    home_link.click()


In [None]:
# ESWD auxiliary functions for building the query dates and typing them into the form
def get_end_date(curr_year, curr_month):
    """Return the last day of the given month formatted as ``DD-MM-YYYY``.

    The day comes from ``calendar.monthrange`` so February yields 29 in leap
    years; a fixed 28 made every leap-year query miss reports dated 29 Feb.

    Args:
        curr_year: year, e.g. ``'1996'``; must be convertible with ``int()``.
        curr_month: month, e.g. ``'02'``; must be convertible with ``int()``.
            It is inserted into the result exactly as given.

    Raises:
        ValueError: if the year or month is not numeric.
        calendar.IllegalMonthError: if the month is outside 1..12.
    """
    last_day = calendar.monthrange(int(curr_year), int(curr_month))[1]
    return f'{last_day:02d}-{curr_month}-{curr_year}'

def get_start_date(curr_year,curr_month):
    """Return the first day of the given month formatted as ``01-MM-YYYY``."""
    parts = ('01', f'{curr_month}', f'{curr_year}')
    return '-'.join(parts)

def input_date(driver, start_date, end_date, start_date_xpath, end_date_xpath):
    """Type the start and end dates into the two date inputs of the form.

    For each field in turn (start first, then end): clear the input, pause for
    half a second, then send the date text. The input is located again before
    every action.
    """
    fields = ((start_date_xpath, start_date), (end_date_xpath, end_date))
    for field_xpath, date_text in fields:
        driver.find_element_by_xpath(field_xpath).clear()
        time.sleep(0.5)
        driver.find_element_by_xpath(field_xpath).send_keys(date_text)


In [None]:
# function to get text from the element in the main result page of NOAA
def get_web_element_from_result_page(driver, xpath, index,base_xpath = "//table[@id='results']//tr[position()>2 and position()<last()]"):
    """Find one element inside the ``index``-th row matched by ``base_xpath``.

    Args:
        driver: WebDriver showing the result page.
        xpath: path appended after the row selector (e.g. ``"/td[2]"``).
        index: zero-based row index; converted to XPath's one-based position.
        base_xpath: selector for the result rows.

    Returns:
        Whatever ``driver.find_element_by_xpath`` returns for the combined path.
    """
    row_position = index + 1
    row_xpath = f"{base_xpath}[{row_position}]"
    return driver.find_element_by_xpath(f"{row_xpath}{xpath}")

In [None]:
def move_to_inner_page(driver, xpath = '//td[1]//*[@href]'):
    """Click the first element matched by ``xpath`` to open a detail page.

    The default used to be ``'//td[1]//@href'``, which selects an attribute
    node. WebDriver locators must resolve to elements, so that default raised
    an invalid-selector error and could never be clicked. The default now
    selects the element that carries the ``href`` attribute instead.
    """
    driver.find_element_by_xpath(xpath).click()

def return_from_inner_page(driver, xpath = "//*[@id='anch_9']"):
    """Click the element matched by ``xpath`` to leave the detail page."""
    back_link = driver.find_element_by_xpath(xpath)
    back_link.click()
    
def get_inner_element_text(driver, xpath):
    """Return the ``text`` of the first element matched by ``xpath``."""
    matched = driver.find_element_by_xpath(xpath)
    return matched.text

In [None]:
def scrape_noaa(driver, curr_year, noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list,lat_lon_list,noaa_years_list):
    """Collect every row of the current NOAA result page into the accumulator lists.

    For each result row: record ``curr_year``, read six cells of the row, open
    the row's detail page through the link in the first cell, read four fields
    there, and navigate back. All lists are appended to in place and are also
    returned, in the same order as the list parameters.
    """
    rows_xpath = "//table[@id='results']//tr[position()>2 and position()<last()]"
    city_link_xpath = "/td[1]//a"
    back_link_xpath = "//*[@id='anch_9']"

    # (destination list, cell path relative to the row), in the order they are read.
    summary_cells = (
        (noaa_city_list, city_link_xpath),
        (noaa_district_list, "/td[2]"),
        (noaa_date_list, "/td[4]"),
        (noaa_time_list, "/td[5]"),
        (noaa_scale_list, "/td[8]"),
        (noaa_deaths, "/td[9]"),
    )
    # (destination list, XPath on the detail page): the cell following the label cell.
    detail_fields = (
        (noaa_length_list, "//table//tr/td[contains(text(), 'Length')]//following-sibling::td"),
        (noaa_width_list, "//table//tr/td[contains(text(), 'Width')]//following-sibling::td"),
        (noaa_country_list, "//table//tr/td[contains(text(), 'State')]//following-sibling::td"),
        (lat_lon_list, "//table//tr/td[contains(text(), 'Begin Lat/Lon')]//following-sibling::td"),
    )

    row_total = len(driver.find_elements_by_xpath(f'{rows_xpath}'))
    for row_index in range(row_total):
        noaa_years_list.append(curr_year)
        for destination, cell_xpath in summary_cells:
            destination.append(get_web_element_from_result_page(driver, cell_xpath, row_index).text)
        # Open the detail page of this row via the link in its first cell.
        get_web_element_from_result_page(driver, city_link_xpath, row_index).click()
        for destination, field_xpath in detail_fields:
            destination.append(get_inner_element_text(driver, field_xpath))
        return_from_inner_page(driver, back_link_xpath)

    return (
        noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list,
        noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list,
        noaa_width_list, lat_lon_list, noaa_years_list,
    )

In [None]:
# Scrape the NOAA Storm Events site month by month and collect the rows into `usdf`.
noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list, lat_lon_list, noaa_years_list  = [], [], [], [], [], [], [] , [], [],[],[]
driver_noaa = create_web_driver()
try:
    start_web_driver(driver_noaa, NOAA_URL)
    for year in YEARS_TO_COUNT:
        # NOTE(review): only months 1 and 2 are queried here; widen the range for a full run.
        for month in range(1,3):
            noaa_input_start_date(driver_noaa, year, month)
            noaa_input_end_date(driver_noaa, year, month)
            noaa_input_event_type(driver_noaa)
            noaa_start_search(driver_noaa)
            noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list, lat_lon_list, noaa_years_list = scrape_noaa(driver_noaa, year, noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list,lat_lon_list,noaa_years_list)
            noaa_return_to_home(driver_noaa)
finally:
    # Always release the browser, including when a lookup fails mid-scrape;
    # previously the driver was never closed in this cell.
    close_web_driver(driver_noaa)
usdf = pd.DataFrame({'City': noaa_city_list, 'District': noaa_district_list, 'Country': noaa_country_list, 'Date': noaa_date_list, 'Time': noaa_time_list, 'Scale': noaa_scale_list, 'Deaths': noaa_deaths, 'Length': noaa_length_list, 'Width': noaa_width_list, 'Lat Lon': lat_lon_list, 'Year': noaa_years_list})
usdf

In [40]:
def convert_miles_to_km(miles):
    """Convert miles to kilometres, rounded to two decimal places."""
    km_per_mile = 1.60934
    kilometres = miles * km_per_mile
    return round(kilometres, 2)

def convert_yards_to_meters(yards):
    """Convert yards to metres, rounded to two decimal places."""
    metres_per_yard = 0.9144
    metres = yards * metres_per_yard
    return round(metres, 2)

def create_date_in_df(df):
    """Add a ``Date`` column (``day/month/year``) and drop the raw date columns.

    ``BEGIN_YEARMONTH`` is read as a ``YYYYMM`` string: the first four
    characters are the year, the rest the month. ``BEGIN_DAY`` is used as is
    (not zero-padded). Columns are addressed by name; the earlier version used
    fixed positions (``iloc[:, 1]``, ``[:, 13]``, ``[:, 12]``) and so only
    worked for one exact column layout.

    Args:
        df: frame containing ``BEGIN_YEARMONTH`` and ``BEGIN_DAY``.

    Returns:
        The same frame, modified in place: ``Date`` is appended and
        ``BEGIN_YEARMONTH``, ``BEGIN_DAY``, ``Month`` and ``Year`` are removed.

    Raises:
        KeyError: if ``BEGIN_YEARMONTH`` or ``BEGIN_DAY`` is missing.
    """
    year_month = df['BEGIN_YEARMONTH'].astype(str)
    day = df['BEGIN_DAY'].astype(str)
    df['Date'] = day.str.cat([year_month.str[4:], year_month.str[:4]], sep='/')
    df.drop(columns=['BEGIN_YEARMONTH', 'BEGIN_DAY'], inplace=True)
    # The original always ended without Month/Year columns; keep that guarantee.
    df.drop(columns=['Month', 'Year'], inplace=True, errors='ignore')
    return df

In [41]:
# Download the yearly NOAA "details" CSV files (1994-2017), keep the tornado rows,
# normalise the columns and write the combined table to noaa_data.csv.
start_csv_xpath = "//a[contains(@href, 'details-ftp_v1.0_d"
end_csv_xpath = "')]"
# NOTE: the 'Longtitude' spelling is the column name written to the output file and is kept as is.
rename_dict = {"STATE": "Country","CZ_NAME":"District", "TOR_F_SCALE": "Scale", "TOR_LENGTH":"Length","TOR_WIDTH":"Width","BEGIN_LOCATION":"City","BEGIN_LAT":"Latitude","BEGIN_LON":"Longtitude", "DEATHS_DIRECT": "Deaths", "YEAR": "Year"}
col_list = ["BEGIN_YEARMONTH", "BEGIN_DAY", "STATE", "EVENT_TYPE", "CZ_NAME", "BEGIN_DATE_TIME", "DEATHS_DIRECT", "TOR_F_SCALE", "TOR_LENGTH", "TOR_WIDTH", "BEGIN_LOCATION", "BEGIN_LAT", "BEGIN_LON"]
df_list = []
csv_link = "https://www1.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/"
csv_driver = create_web_driver()
try:
    start_web_driver(csv_driver, csv_link)
    time.sleep(1)
    for i in range(1994,2018):
        # The link text in the directory listing is the file name for year `i`.
        csv_file_name = csv_driver.find_element_by_xpath(f"{start_csv_xpath}{i}{end_csv_xpath}").text
        df_to_append = pd.read_csv(csv_link + csv_file_name, compression='gzip', usecols=col_list)
        df_to_append.rename(columns=rename_dict, inplace=True)
        # Split "date time" into two columns; rows are re-aligned by index after filtering.
        split_df = df_to_append.BEGIN_DATE_TIME.str.split(' ', expand=True)
        df_to_append = df_to_append[df_to_append['EVENT_TYPE'] == 'Tornado'].drop(columns=['EVENT_TYPE', "BEGIN_DATE_TIME"])
        df_to_append['Time'] = split_df[1].str[:5]
        df_to_append = create_date_in_df(df_to_append)
        df_to_append["Length"] = df_to_append.Length.apply(convert_miles_to_km)
        df_to_append["Width"] = df_to_append.Width.apply(convert_yards_to_meters)
        df_list.append(df_to_append)
finally:
    # Release the browser even when a download or lookup fails;
    # previously this driver was never closed.
    close_web_driver(csv_driver)
csv_df = pd.concat(df_list, ignore_index=True)
csv_df = csv_df[["District","City","Country","Longtitude", "Latitude" ,"Date","Time","Scale","Length", "Width", "Deaths"]]
csv_df.to_csv('noaa_data.csv')



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/elad/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache
  return webdriver.Chrome(ChromeDriverManager().install())
  df_to_append = pd.read_csv(csv_link + csv_driver.find_element_by_xpath(f"{start_csv_xpath}{i}{end_csv_xpath}").text, compression='gzip', usecols=col_list)


In [None]:
# function that accumulates the data of one ESWD result page into lists (via the auxiliary functions below) in preparation for building the dataframe
def get_data_from_page(driver,year,district, city, country, coordinates, date, times, scale,length,years):
    """Append one entry per report on the current result page to each accumulator list.

    Args:
        driver: WebDriver showing a result page.
        year: value appended to ``years`` once per report.
        district, city, country, coordinates, date, times, scale, length, years:
            accumulator lists; the helper functions append to them in place.

    Returns:
        The tuple ``(district, city, country, coordinates, date, times, scale, length, years)``.

    NOTE(review): ``len_elements[i]`` assumes the page has at least as many
    'TORNADO detail_info_entry' paragraphs as report paragraphs, in the same
    order -- confirm; an IndexError is raised otherwise.
    """
    # Report paragraphs: <p> inside td.base_info that contain a <b> and are not 'smallgray'.
    base_xpath ="//td[@class='base_info']/p[b and not(@class='smallgray')]"
    # Bold children of a report paragraph, addressed by position.
    district_xpath = "/b[1]"
    bold_tags_xpath = "/b"
    country_xpath = "/b[2]"
    date_xpath = "/b[3]"
    time_xpath = "/b[4]"
    # Pieces of the scale query; they are combined per report inside the loop.
    scale_xpath_1 = "(//p[@class='TORNADO detail_info_entry'])"
    scale_xpath_2 = "/b[contains(text(),'F')]" 
    scale_xpath_3 = "((//p[@class='TORNADO'][contains(b,'tornado')])"
    scale_xpath_4 = "/b[text()='tornado'])"
    reports_count = len(driver.find_elements_by_xpath(base_xpath))
    len_elements = driver.find_elements_by_xpath(scale_xpath_1)
    parent_items= driver.find_elements_by_xpath(base_xpath)
    for i in range(reports_count):
        years.append(year)
        district = get_elements_from_list(driver,district_xpath,base_xpath,district,i)
        # Text of all bold children of report i; the helpers strip it from the
        # paragraph text to get at the non-bold remainder.
        child_item = driver.find_elements_by_xpath((f'({base_xpath})[{i+1}]{bold_tags_xpath}'))
        child_item_text = get_text_from_element_list(child_item)
        city= get_city_list(child_item_text,parent_items[i].text,city,0)
        country = get_elements_from_list(driver,country_xpath,base_xpath,country,i)
        coordinates =  get_coordinates_list(child_item_text,parent_items[i].text,coordinates,1)
        date = get_elements_from_list(driver,date_xpath,base_xpath,date,i)
        times = get_elements_from_list(driver,time_xpath,base_xpath,times,i)
        # Union of: bold text containing 'F' in the (i+1)-th detail paragraph,
        # and the bold 'tornado' label of the (i+1)-th TORNADO paragraph.
        scale_elements_list = driver.find_elements_by_xpath((f'{scale_xpath_1}[{i+1}]{scale_xpath_2} | {scale_xpath_3}[{i+1}]{scale_xpath_4}'))
        scale = get_scale_list(scale,scale_elements_list)
        length = get_len_list(child_item_text,parent_items[i].text,length,1,len_elements[i])
    return district, city, country, coordinates, date, times, scale,length,years


In [None]:
# auxiliary functions for the initial parsing and cleaning of the text captured via XPath
def replace_child_text(child_text,field):
    """Strip the words of ``child_text`` out of ``field``.

    ``child_text`` is split on single spaces. For every resulting word, newlines
    are removed from ``field``, the first occurrence of the word is deleted, and
    leading whitespace is trimmed.

    Returns:
        The cleaned ``field`` string.
    """
    remainder = field
    for token in child_text.split(' '):
        flattened = remainder.replace('\n','')
        remainder = flattened.replace(token,'',1).lstrip()
    return remainder

def checks_split_position(field,selector):
    """Return the index to use on ``field.split('(')``.

    When ``field`` starts with ``'('`` the first split segment is empty, so the
    index is shifted by one. ``str.startswith`` is used so that an empty
    ``field`` returns ``selector`` unchanged instead of raising ``IndexError``.
    """
    return selector + 1 if field.startswith('(') else selector

def checks_digit(field,selector):
    """Tell whether segment ``selector`` of ``field.split('(')`` starts with a digit.

    Returns ``False`` when that segment does not exist or is empty. Both cases
    used to raise ``IndexError`` and abort the scrape on a single oddly
    formatted report.
    """
    segments = field.split('(')
    try:
        segment = segments[selector]
    except IndexError:
        return False
    # Slicing gives '' for an empty segment, and ''.isdigit() is False.
    return segment[:1].isdigit()

def checks_len(field):
    """Return True when ``field`` contains the ``'<'`` character."""
    has_less_than = '<' in field
    return has_less_than
    
def get_text_from_element_list(elem_list):
    """Concatenate the ``text`` of every element, each followed by one space.

    Returns an empty string for an empty list. The result keeps a trailing
    space after the last element.
    """
    return ''.join(elem.text + ' ' for elem in elem_list)


In [None]:
# auxiliary functions that extract the coordinates text and append it to the coordinates list
def append_coordinates_list(field,curr_list,position):
    """Append segment ``position`` of ``field.split('(')`` to ``curr_list``.

    When ``field`` contains ``'<'`` the segment is cut at the first ``')'`` and
    right-stripped. Otherwise it is right-stripped and every ``')'`` is removed.

    Returns:
        ``curr_list`` (the same list object).
    """
    has_length_marker = checks_len(field)
    segment = field.split('(')[position]
    if has_length_marker:
        value = segment.split(')')[0].rstrip()
    else:
        value = segment.rstrip().replace(')','')
    curr_list.append(value)
    return curr_list

def get_coordinates_list(child_text,field,curr_list,selector):
    """Append the coordinates found in ``field`` to ``curr_list``, or NaN when none are found.

    After removing ``child_text`` from ``field``, the segment at the computed
    split position is tried first and the one before it second. The first one
    that starts with a digit is appended as the coordinates.

    Returns:
        ``curr_list`` (the same list object).
    """
    remainder = replace_child_text(child_text,field)
    first_choice = checks_split_position(remainder, selector)
    for candidate in (first_choice, first_choice - 1):
        if checks_digit(remainder, candidate):
            return append_coordinates_list(remainder, curr_list, candidate)
    curr_list.append(np.nan)
    return curr_list


In [None]:
# auxiliary functions that extract the scale, path length and city name and append them to the relevant lists
def get_scale_list(scale,scale_elements_list):
    """Append exactly one scale value for the current report to ``scale``.

    If the element list has size 1 it holds only the dummy "tornado" label, so
    there is no scale and NaN is appended. Otherwise the text of the first
    element containing ``'F'`` is appended, or NaN when no element has one.

    The earlier version appended nothing for an empty list (or when no text
    contained ``'F'``) and appended several values when more than one element
    matched. Either case left ``scale`` with a different length than the other
    per-report lists, which breaks building the DataFrame.

    Returns:
        ``scale`` (the same list object).
    """
    rating = np.nan
    if len(scale_elements_list) != 1:
        for element in scale_elements_list:
            text = element.text
            if 'F' in text:
                rating = text
                break
    scale.append(rating)
    return scale
def get_len_list(child_text,field,curr_list,selector,element):
    """Append the path length of the current report to ``curr_list``, or NaN.

    If the text of ``element`` contains 'path length:', the value between that
    marker and the next 'k' is taken. Otherwise ``child_text`` is removed from
    ``field``; when the selected segment starts with a digit and the remainder
    contains '<', the value between '<' and the next 'k' is taken. In every
    other case NaN is appended.

    Returns:
        ``curr_list`` (the same list object).
    """
    marker = 'path length:'
    if marker in element.text:
        curr_list.append(element.text.split(marker)[1].split('k')[0].strip())
        return curr_list
    remainder = replace_child_text(child_text,field)
    if checks_digit(remainder,selector) and checks_len(remainder):
        value = remainder.split('<')[1].split('k')[0].strip()
    else:
        value = np.nan
    curr_list.append(value)
    return curr_list
def get_city_list(child_text,field,curr_list,selector):
    """Append the city name of the current report to ``curr_list``, or NaN.

    ``child_text`` is removed from ``field`` first. If the selected segment of
    the remainder starts with a digit there is no city name and NaN is
    appended. Otherwise the segment is appended, right-stripped and without
    ``')'`` characters.

    Returns:
        ``curr_list`` (the same list object).
    """
    remainder = replace_child_text(child_text,field)
    position = checks_split_position(remainder, selector)
    if checks_digit(remainder,position):
        city_name = np.nan
    else:
        city_name = remainder.split('(')[position].rstrip().replace(')','')
    curr_list.append(city_name)
    return curr_list

In [None]:
# auxiliary functions that append the bold text to the relevant lists and build a dataframe from the lists
def get_elements_from_list(driver,curr_xpath,base_xpath,curr_list,i):
    """Append the text of the ``i``-th element matching ``base_xpath + curr_xpath``.

    Returns:
        ``curr_list`` (the same list object).
    """
    full_xpath = base_xpath+curr_xpath
    matches = driver.find_elements_by_xpath(full_xpath)
    curr_list.append(matches[i].text)
    return curr_list

def get_df_from_lists(district, city, country, coordinates, date, times, scale,length,year):
    """Build a DataFrame with one column per accumulator list.

    Column order: district, city, country, coordinates, date, time, scale,
    'length (km)', year.
    """
    column_names = ('district', 'city', 'country', 'coordinates', 'date', 'time', 'scale', 'length (km)', 'year')
    column_values = (district, city, country, coordinates, date, times, scale, length, year)
    return pd.DataFrame(dict(zip(column_names, column_values)))

In [None]:
# Main ESWD crawl: query the site month by month, accumulate the reports and save them as a CSV.
district, city, country, coordinates, date, times, scale,length,years = [], [], [], [], [], [], [], [],[]
driver = create_web_driver()
try:
    start_web_driver(driver,ESWD_URL)
    driver.find_element_by_xpath(ESWD_TORNADO_XPATH).click()
    for year in YEARS_TO_COUNT:
        for month in MONTHS_TO_COUNT:
            end_date = get_end_date(year,month)
            start_date = get_start_date(year, month)
            input_date(driver, start_date, end_date,START_DATE_XPATH,END_DATE_XPATH)
            driver.find_element_by_xpath(SUBMIT_XPATH).click()
            reports_amount = driver.find_element_by_xpath(FIND_REPORTS_COUNT_XPATH).text
            log(f'{year}-{month}-{reports_amount}', 'amount_of_reports.log')
            if "no report" in reports_amount:
                continue
            time.sleep(2)
            district, city, country, coordinates, date, times, scale,length,years = get_data_from_page(driver,year,district, city, country, coordinates, date, times, scale,length,years)
finally:
    # Release the browser even when the crawl fails part-way.
    close_web_driver(driver)
# Build the frame once, after the loop. It used to be rebuilt for every month
# and was left undefined (NameError on save) when no month had any reports.
edf = get_df_from_lists(district, city, country, coordinates, date, times, scale,length,years)
edf.to_csv('ESWD_data.csv')