In [None]:
import calendar
import re
import time

import requests
import numpy as np
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Auxiliary functions for creating the web driver and closing it.
# Global variable initialization: the years to scrape, as strings ('1994' through '2017').
YEARS_TO_COUNT = [str(year) for year in range(1994, 2018)]
def create_web_driver():
    """Create and return a Chrome web driver.

    The value returned by ``ChromeDriverManager().install()`` is passed to
    ``webdriver.Chrome`` as its first positional argument.
    """
    # NOTE(review): recent Selenium releases expect a Service object rather than
    # a positional driver path here - confirm against the installed version.
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(chromedriver_path)

def start_web_driver(driver, url):
    """Maximize the browser window of ``driver`` and then load ``url`` in it."""
    driver.maximize_window()
    driver.get(url)

def close_web_driver(driver):
    """Shut the web driver down completely.

    ``quit()`` is used instead of ``close()``: per the Selenium API, ``close()``
    only closes the current window, whereas ``quit()`` quits the driver and
    closes every associated window, so the session is not left behind.
    """
    driver.quit()

def log(message, filename):
    """Append ``message`` followed by a newline to the file ``filename``.

    The file is created if it does not exist. It is always written as UTF-8 so
    that non-ASCII messages are stored the same way on every platform instead
    of depending on the locale's default encoding.

    Args:
        message: Text to append (must be a ``str``).
        filename: Path of the log file; it is converted to text with ``str()``.
    """
    with open(str(filename), 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')


In [None]:
# Global NOAA configuration: the search-form URL and the XPath selectors used to drive it.
NOAA_URL = 'https://www.ncdc.noaa.gov/stormevents/choosedates.jsp?statefips=-999,ALL'
# Shared prefix meaning "an <option> of <select id=...> whose text contains ...".
# It is intentionally left open: callers append "<value>')]" to finish the expression.
_NOAA_OPTION_TEXT_PREFIX = "//select[@id='{}']//option[contains(text(),'"
NOAA_BEGINMONTH_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('beginDate_mm')
NOAA_BEGINYEAR_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('beginDate_yyyy')
# The begin day is always the first of the month, so this selector is already complete.
NOAA_BEGINDAY_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('beginDate_dd') + "01')]"
NOAA_ENDMONTH_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('endDate_mm')
NOAA_ENDYEAR_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('endDate_yyyy')
NOAA_ENDDAY_XPATH = _NOAA_OPTION_TEXT_PREFIX.format('endDate_dd')
# Complete selectors for the event-type option, the search button and the "return" anchor.
NOAA_EVENTTYPE_XPATH = "//select[@id='eventType']//option[contains(@value,'Tornado')]"
NOAA_SEARCH_XPATH = "//input[@value='Search']"
NOAA_RETURN_XPATH = '//*[@id="anch_8"]'


In [None]:
# NOAA helpers that select the begin-date and end-date options in the search form through the given web driver
def noaa_input_start_date(driver, year, month):
    """Select the begin date of the NOAA search form.

    Clicks, in order, the begin-month option whose text contains ``month``,
    the begin-year option whose text contains ``year``, and the fixed
    begin-day option (``NOAA_BEGINDAY_XPATH``).
    """
    option_xpaths = (
        f"{NOAA_BEGINMONTH_XPATH}{month}')]",
        f"{NOAA_BEGINYEAR_XPATH}{year}')]",
        f"{NOAA_BEGINDAY_XPATH}",
    )
    for option_xpath in option_xpaths:
        # Every option is located afresh and clicked twice in a row.
        # NOTE(review): the double click looks deliberate - confirm why it is needed.
        for _ in range(2):
            driver.find_element_by_xpath(option_xpath).click()

def noaa_input_end_date(driver, curr_year, end_month):
    """Select the end date of the NOAA search form: the last day of the month.

    Clicks the end-month option whose text contains ``end_month``, the
    end-year option whose text contains ``curr_year``, and then the end-day
    option for the last calendar day of that month.

    Args:
        driver: Web driver currently showing the NOAA search form.
        curr_year: Year as an int or a numeric string (e.g. ``'1996'``).
        end_month: Month number 1-12 as an int or a numeric string.

    Raises:
        ValueError: If ``curr_year`` or ``end_month`` cannot be interpreted as
            a valid year / month number when the last day is computed.
    """
    month_and_year_xpaths = (
        f"{NOAA_ENDMONTH_XPATH}{end_month}')]",
        f"{NOAA_ENDYEAR_XPATH}{curr_year}')]",
    )
    for option_xpath in month_and_year_xpaths:
        # Every option is located afresh and clicked twice in a row.
        for _ in range(2):
            driver.find_element_by_xpath(option_xpath).click()

    # monthrange() returns (weekday of the 1st, number of days in the month);
    # using it gives July its 31 days and February its 29th day in leap years.
    end_day = calendar.monthrange(int(curr_year), int(end_month))[1]
    end_day_xpath = f"{NOAA_ENDDAY_XPATH}{end_day}')]"
    for _ in range(2):
        driver.find_element_by_xpath(end_day_xpath).click()
    


In [None]:
# auxiliary functions that click the relevant controls on the NOAA page through the web driver
def noaa_start_search(driver):
    """Click the element matched by ``NOAA_SEARCH_XPATH`` (the 'Search' input)."""
    search_button = driver.find_element_by_xpath(NOAA_SEARCH_XPATH)
    search_button.click()

def noaa_input_event_type(driver):
    """Click the event-type option matched by ``NOAA_EVENTTYPE_XPATH`` (value contains 'Tornado')."""
    event_type_option = driver.find_element_by_xpath(NOAA_EVENTTYPE_XPATH)
    event_type_option.click()

def noaa_return_to_home(driver):
    """Click the element matched by ``NOAA_RETURN_XPATH`` to leave the results page."""
    # NOTE(review): presumably this anchor leads back to the search form - confirm on the live page.
    return_link = driver.find_element_by_xpath(NOAA_RETURN_XPATH)
    return_link.click()


In [None]:
# function to get a web element (not its text) from one row of the main NOAA result page
def get_web_element_from_result_page(driver, xpath, index,base_xpath = "//table[@id='results']//tr[position()>2 and position()<last()]"):
    """Return the element found at ``xpath`` inside the ``index``-th result row.

    Args:
        driver: Web driver currently showing the results page.
        xpath: XPath suffix appended after the row selector (e.g. ``"/td[2]"``).
        index: Zero-based row index; it is converted to XPath's one-based
            position by adding 1.
        base_xpath: XPath selecting the candidate result rows.

    Returns:
        Whatever ``driver.find_element_by_xpath`` returns for the combined XPath.
    """
    row_selector = f"{base_xpath}[{index + 1}]"
    cell_selector = f"{row_selector}{xpath}"
    return driver.find_element_by_xpath(cell_selector)

In [None]:
def move_to_inner_page(driver, xpath = '//td[1]//a[@href]'):
    """Click the element located by ``xpath`` to open an inner (detail) page.

    The default selects the first link (an ``<a>`` carrying an ``href``) found
    inside a first-column table cell. It targets the anchor element itself: an
    XPath ending in ``@href`` yields an attribute node, which Selenium cannot
    return as an element and therefore cannot click.

    Args:
        driver: Web driver showing the page that contains the link.
        xpath: XPath of the element to click.
    """
    driver.find_element_by_xpath(xpath).click()

def return_from_inner_page(driver, xpath = "//*[@id='anch_9']"):
    """Click the element located by ``xpath`` to leave an inner (detail) page.

    Args:
        driver: Web driver currently showing the inner page.
        xpath: XPath of the element to click; defaults to the element whose
            id is ``anch_9``.
    """
    back_link = driver.find_element_by_xpath(xpath)
    back_link.click()
    
def get_inner_element_text(driver, xpath):
    """Return the ``.text`` of the element that ``driver`` finds at ``xpath``."""
    inner_element = driver.find_element_by_xpath(xpath)
    return inner_element.text

In [None]:
def scrape_noaa(driver, curr_year, noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list,lat_lon_list,noaa_years_list):
    """Scrape every row of the current NOAA results page into the given lists.

    For each result row the summary cells (city link text, district, date,
    time, scale, deaths) are read from the results table, the city link is
    clicked to open the inner page, and the cells following the 'Length',
    'Width', 'State' and 'Begin Lat/Lon' labels are read there. The values are
    then appended to the lists and the driver returns to the results page.

    All values of one row are collected before anything is appended, so a
    lookup that fails part-way through a row cannot leave the lists with
    different lengths. If no row matches, the lists are returned unchanged.

    Args:
        driver: Web driver currently showing a NOAA results page.
        curr_year: Value appended to ``noaa_years_list`` once per scraped row.
        noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list,
        noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list,
        noaa_width_list, lat_lon_list, noaa_years_list: Lists that are extended
            in place, one element per scraped row. ``noaa_country_list``
            receives the cell that follows the 'State' label.

    Returns:
        The same eleven lists, in the order: city, district, country, date,
        time, scale, deaths, length, width, lat/lon, years.
    """
    noaa_base_xpath = "//table[@id='results']//tr[position()>2 and position()<last()]"
    return_from_inner_page_xpath = "//*[@id='anch_9']"
    noaa_city_xpath = "/td[1]//a"

    # (destination list, cell XPath relative to the result row), in read order.
    summary_fields = (
        (noaa_city_list, noaa_city_xpath),
        (noaa_district_list, "/td[2]"),
        (noaa_date_list, "/td[4]"),
        (noaa_time_list, "/td[5]"),
        (noaa_scale_list, "/td[8]"),
        (noaa_deaths, "/td[9]"),
    )
    # On the inner page each value sits in the cell(s) following its label cell.
    labelled_cell_xpath = "//table//tr/td[contains(text(), '{}')]//following-sibling::td"
    detail_fields = (
        (noaa_length_list, labelled_cell_xpath.format('Length')),
        (noaa_width_list, labelled_cell_xpath.format('Width')),
        (noaa_country_list, labelled_cell_xpath.format('State')),
        (lat_lon_list, labelled_cell_xpath.format('Begin Lat/Lon')),
    )

    reports_count = len(driver.find_elements_by_xpath(noaa_base_xpath))
    for i in range(reports_count):
        # Elements are looked up again for every row because the driver leaves
        # and re-enters the results page on each iteration.
        summary_values = [
            get_web_element_from_result_page(driver, cell_xpath, i, noaa_base_xpath).text
            for _, cell_xpath in summary_fields
        ]
        get_web_element_from_result_page(driver, noaa_city_xpath, i, noaa_base_xpath).click()
        detail_values = [
            get_inner_element_text(driver, cell_xpath)
            for _, cell_xpath in detail_fields
        ]

        # Commit the complete row in one go so every list grows by exactly one.
        noaa_years_list.append(curr_year)
        for (target_list, _), value in zip(summary_fields + detail_fields, summary_values + detail_values):
            target_list.append(value)

        return_from_inner_page(driver, return_from_inner_page_xpath)
    return noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list,lat_lon_list,noaa_years_list

In [None]:
# One output list per DataFrame column; scrape_noaa appends one element per scraped row.
noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list, lat_lon_list, noaa_years_list  = [], [], [], [], [], [], [] , [], [],[],[]
driver_noaa = create_web_driver()
try:
    start_web_driver(driver_noaa, NOAA_URL)
    for year in YEARS_TO_COUNT:
        # NOTE(review): range(1, 3) only covers January and February - confirm
        # whether the remaining months are meant to be scraped as well.
        for month in range(1,3):
            noaa_input_start_date(driver_noaa, year, month)
            noaa_input_end_date(driver_noaa, year, month)
            noaa_input_event_type(driver_noaa)
            noaa_start_search(driver_noaa)
            noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list, lat_lon_list, noaa_years_list = scrape_noaa(driver_noaa, year, noaa_city_list, noaa_district_list, noaa_country_list, noaa_date_list, noaa_time_list, noaa_scale_list, noaa_deaths, noaa_length_list, noaa_width_list,lat_lon_list,noaa_years_list)
            noaa_return_to_home(driver_noaa)
finally:
    # Always release the browser, even when a lookup fails half-way through the scrape.
    close_web_driver(driver_noaa)
usdf = pd.DataFrame({'City': noaa_city_list, 'District': noaa_district_list, 'Country': noaa_country_list, 'Date': noaa_date_list, 'Time': noaa_time_list, 'Scale': noaa_scale_list, 'Deaths': noaa_deaths, 'Length': noaa_length_list, 'Width': noaa_width_list, 'Lat Lon': lat_lon_list, 'Year': noaa_years_list})
usdf