# In [None]:
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from time import sleep, time
from tqdm import tqdm
import pandas as pd
import glob
import os

# wall-clock timestamp taken at script start; the elapsed time is printed at the end
start_time = time()

## functions

# date conversion
def date_conversion(date):
    """Expand a list of year specifications into a sorted list of years.

    Each element is either a single year (``'2014'`` or ``2014``) or a range
    string whose first four and last four characters are the boundary years
    (for example ``'2014-2018'``). Ranges are inclusive on both ends.

    Args:
        date: iterable of year strings/ints and year-range strings.

    Returns:
        Sorted list of the unique years as ints.

    Raises:
        ValueError: if a range element does not start and end with a
            four-digit number.
    """
    years = set()
    for element in date:
        # str() lets callers pass plain ints; strip() tolerates padded input
        text = str(element).strip()
        if text.isdigit():
            years.add(int(text))
            continue
        first, last = int(text[:4]), int(text[-4:])
        # a range written backwards ('2018-2014') must not be silently dropped
        low, high = min(first, last), max(first, last)
        years.update(range(low, high + 1))
    return sorted(years)

# age group selection
def select_age_groups():
    """Make sure the age-group filter has a selection.

    Waits up to one second for the element with id ``Age_all`` to become
    clickable. If that does not happen, the input of the
    ``filter_clustered_column_chart_2_key`` filter is clicked and then the
    first ``span`` of its result list.

    Uses the module-level ``browser`` and ``wait`` objects.
    """
    # NOTE(review): presumably 'Age_all' is the "all ages" entry that is present
    # once the filter is already set - confirm against the page
    quick_wait = WebDriverWait(browser, 1)
    try:
        quick_wait.until(EC.element_to_be_clickable((By.ID, 'Age_all')))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the script
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="filter_clustered_column_chart_2_key"]/input'))).click()
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="filter_clustered_column_chart_2_key_results_list"]/span'))).click()

# country selection
def select_country(country):
    """Type ``country`` into the country filter and tick the matching entry.

    Steps:
      1. click the tenth element carrying the class ``option-selected``;
      2. type ``country`` into the ``filter_clustered_column_chart_5_key``
         input (one retry if the first attempt fails);
      3. wait up to one second for the result list of that filter; when it
         shows up, click the fifth ``input`` element of the page, otherwise
         clear the typed text;
      4. in both cases click the ``filter_clustered_column_chart_3_key`` input
         and the first ``span`` of its result list.

    Uses the module-level ``browser`` and ``wait`` objects.

    Args:
        country: country name to type into the filter.

    Returns:
        True when the result list appeared and the follow-up clicks succeeded,
        False when the country was not found (the typed text is removed).
    """
    country_input = (By.XPATH, '//*[@id="filter_clustered_column_chart_5_key"]/input')
    country_results = (By.XPATH, '//*[@id="filter_clustered_column_chart_5_key_results_list"]')
    filter_3_input = (By.XPATH, '//*[@id="filter_clustered_column_chart_3_key"]/input')
    filter_3_option = (By.XPATH, '//*[@id="filter_clustered_column_chart_3_key_results_list"]/span')

    # NOTE(review): index 9 presumably addresses the currently selected country
    # entry - confirm against the page layout
    wait.until(EC.presence_of_all_elements_located(
        (By.CLASS_NAME, 'option-selected')))[9].click()

    try:
        wait.until(EC.element_to_be_clickable(country_input)).send_keys(country)
    except Exception:
        # single retry; a second failure propagates to the caller
        wait.until(EC.element_to_be_clickable(country_input)).send_keys(country)

    # country search - this must run after *every* successful typing, not only
    # after a retried one, otherwise the country is never actually selected
    # and the function falls through returning None
    try:
        quick_wait = WebDriverWait(browser, 1)
        quick_wait.until(EC.element_to_be_clickable(country_results))
        wait.until(EC.presence_of_all_elements_located(
            (By.TAG_NAME, 'input')))[4].click()
        wait.until(EC.element_to_be_clickable(filter_3_input)).click()
        wait.until(EC.element_to_be_clickable(filter_3_option)).click()
        return True

    # country not found
    except Exception:
        # select everything in the input and delete it so the next country
        # starts from an empty field
        wait.until(EC.element_to_be_clickable(country_input)).send_keys(
            Keys.CONTROL, 'a', Keys.BACKSPACE)
        wait.until(EC.element_to_be_clickable(filter_3_input)).click()
        wait.until(EC.element_to_be_clickable(filter_3_option)).click()
        return False

# data type selection
def select_data_type():
    """Choose the data type in the chart-1 filter dropdown.

    Waits (module-level ``wait``) until the ``select`` element below
    ``filter_item_clustered_column_chart_1_key`` is clickable and picks the
    option whose value is ``filter_clustered_column_chart_11_key``.
    """
    dropdown_locator = (
        By.XPATH, '//*[@id="filter_item_clustered_column_chart_1_key"]/select')
    dropdown = wait.until(EC.element_to_be_clickable(dropdown_locator))
    Select(dropdown).select_by_value('filter_clustered_column_chart_11_key')

# year selection (the ability to enter a year not included in the dropdown is not accounted for)
def select_year(year):
    """Pick ``year`` in the year dropdown, then click through filter 3.

    The year is chosen by its visible text in the ``select`` element below
    ``filter_clustered_column_chart_7_key``; a year that is not offered there
    is not handled. Afterwards the ``filter_clustered_column_chart_3_key``
    input and the first ``span`` of its result list are clicked.

    Args:
        year: year to select; converted with ``str`` before the lookup.
    """
    year_locator = (By.XPATH, '//*[@id="filter_clustered_column_chart_7_key"]/select')
    year_dropdown = wait.until(EC.presence_of_element_located(year_locator))
    Select(year_dropdown).select_by_visible_text(str(year))

    follow_up_locators = (
        (By.XPATH, '//*[@id="filter_clustered_column_chart_3_key"]/input'),
        (By.XPATH, '//*[@id="filter_clustered_column_chart_3_key_results_list"]/span'),
    )
    # each element is looked up only after the previous click has happened
    for locator in follow_up_locators:
        wait.until(EC.element_to_be_clickable(locator)).click()

# filter application
def apply_filter():
    """Click the filter button once it is present in the DOM.

    Uses the module-level ``wait`` object.
    """
    button_locator = (
        By.XPATH, '//*[@id="filters_section_clustered_column_chart_key_filter_button"]')
    filter_button = wait.until(EC.presence_of_element_located(button_locator))
    filter_button.click()

# data availability check
def data_availability_check():
    """Report whether the page shows data for the applied filter.

    Waits up to one second for an element with the class
    ``no-data-available``. Uses the module-level ``browser`` object.

    Returns:
        False when such an element is found, True when the wait fails.
    """
    quick_wait = WebDriverWait(browser, 1)
    try:
        quick_wait.until(EC.presence_of_element_located(
            (By.CLASS_NAME, 'no-data-available')))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the script
        return True
    return False

# csv download
def download():
    """Trigger the csv export and wait for the loading modal to disappear.

    Clicks the second button in the visualization header, pauses half a
    second, clicks the first button inside ``modal_content`` and then waits
    until ``loading_modal_content`` is no longer visible. Uses the
    module-level ``wait`` object.

    Returns:
        True when all steps completed, False when any of them failed.
    """
    try:
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="visualization_component_htmlid"]/div/div[1]/div[2]/button[2]'))).click()
        sleep(.5)
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="modal_content"]/button[1]'))).click()
        wait.until(EC.invisibility_of_element_located(
            (By.XPATH, '//*[@id="loading_modal_content"]')))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the script
        return False
    return True

# moving and renaming csv file
def rename_file(country, year, old_place, new_place, cause_of_death):
    """Move the newest downloaded csv into the data folder under a new name.

    Polls ``old_place`` every half second until its most recently created
    ``*.csv`` file has ``WHOMortalityDatabase_`` in its file name, then moves
    that file to ``<new_place>/<cause_of_death>/<country>_<year>.csv``
    (the sub-folder is created when missing; an existing target is replaced).

    The function waits indefinitely when no matching file ever appears.

    Args:
        country: country name used in the new file name.
        year: year used in the new file name.
        old_place: folder the browser downloads into.
        new_place: root folder for the collected data.
        cause_of_death: name of the sub-folder below ``new_place``.

    Returns:
        Path of the moved file.
    """
    # local import: shutil.move also works across drives and over an existing
    # target, where os.rename raises on Windows and the loop would never end
    import shutil

    target_folder = os.path.join(new_place, cause_of_death)
    os.makedirs(target_folder, exist_ok=True)
    new_name = os.path.join(target_folder, f"{country}_{year}.csv")
    search_pattern = os.path.join(old_place, "*.csv")

    while True:
        try:
            old_name = max(glob.glob(search_pattern), key=os.path.getctime)
            # test the file name only, so a folder name cannot produce a match
            if 'WHOMortalityDatabase_' in os.path.basename(old_name):
                shutil.move(old_name, new_name)
                return new_name
        except (ValueError, OSError):
            # ValueError: no csv in the folder yet; OSError: the file vanished
            # or could not be moved at this moment - poll again.
            # Not a bare except, so Ctrl-C can still break out of the loop.
            pass
        sleep(.5)

# data check in the downloaded file
def check_downloaded_file(path_to_file , country, year):
    """Verify that a downloaded csv holds the requested country and year.

    The first six lines of the file are skipped and the first twelve columns
    are read; the first data row must have ``Year == year`` and
    ``Country Name == country``. A file that fails the check - including one
    that is empty, malformed or lacks those columns - is deleted.

    Args:
        path_to_file: path of the csv file to inspect.
        country: expected value of the ``Country Name`` column.
        year: expected value of the ``Year`` column.

    Returns:
        True when the file matches, False when it was rejected and removed.
    """
    try:
        # reading happens inside the try so an unreadable download is treated
        # like a wrong one instead of raising and leaving the file behind
        df = pd.read_csv(
            path_to_file, encoding='utf-8',
            skiprows=list(range(6)), usecols=list(range(12)))
        is_valid = bool(
            df['Year'][0] == year and df['Country Name'][0] == country)
    except (KeyError, IndexError, ValueError):
        # KeyError/IndexError: missing column or no data row;
        # ValueError: pandas parser / empty-data / decoding errors
        is_valid = False

    if not is_valid:
        os.remove(path_to_file)
    return is_valid

# switch between site’s directories
def open_directory(directory):
    """Navigate the browser to the page of the given cause of death.

    ``'All causes'`` opens the fixed all-causes URL. Any other value is looked
    up (for at most two seconds) as an element below ``body`` whose text
    contains ``directory``; the browser is then sent to that element's
    ``href``. Uses the module-level ``browser`` object.

    Args:
        directory: cause-of-death label as shown on the current page.

    Returns:
        True when a page was opened, False when the label was not found, the
        element has no ``href`` or the navigation failed.
    """
    if directory == 'All causes':
        browser.get(
            'https://platform.who.int/mortality/themes/theme-details/MDB/all-causes')
        return True

    short_wait = WebDriverWait(browser, 2)
    try:
        xpath_to_directory = f"//body//*[contains(text(), '{directory}')]"
        directory_element = short_wait.until(
            EC.presence_of_element_located((By.XPATH, xpath_to_directory)))
        href_to_directory = directory_element.get_attribute("href")
        if not href_to_directory:
            # the text matched an element that is not a link
            return False
        browser.get(href_to_directory)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the script
        return False
    return True

## main part

# configuration
# - driver executable, browser download folder and output root folder
path_to_folder_with_ChromeDriver = "chromedriver.exe"
path_to_default_download_folder = r"C:\Users\avbru\Downloads"
path_to_new_folder_with_data = "tabs"

# - causes of death to open on the site, one output sub-folder per entry
causes_of_death = ['Cardiovascular diseases']

# - years to download (single years and/or 'YYYY-YYYY' ranges)
years = date_conversion(['2014', '2018'])

# - countries to download, typed into the site's country filter as written
countries = [
    'Argentina', 'Australia', 'Belarus', 'Brazil',
    'Chile', 'Colombia', 'Dominican Republic', 'Ecuador',
    'Egypt', 'Georgia', 'Germany', 'Greece',
    'Israel', 'Italy', 'Japan', 'Kazakhstan',
    'Kuwait', 'Kyrgyzstan', 'Malaysia', 'Mexico',
    'Peru', 'Philippines', 'Poland', 'Republic of Korea',
    'Romania', 'Russian Federation', 'South Africa', 'Spain',
    'Sweden', 'Thailand',
    'United Kingdom of Great Britain and Northern Ireland',
    'United States of America', 'Uzbekistan',
]

# opening browser
# NOTE(review): the driver path is passed positionally - confirm this matches
# the calling convention of the installed Selenium version
browser = Chrome(path_to_folder_with_ChromeDriver)

# shared explicit wait (10 s timeout) used by the helper functions
wait = WebDriverWait(browser, 10)

# start on the all-causes theme page
browser.get('https://platform.who.int/mortality/themes/theme-details/MDB/all-causes')

# data download statistics: maps '<country>_<cause>_<year>' (or a bare cause /
# country name for lookup failures) to the outcome of the download
download_statistics = {}

# cause of death selection
for directory in tqdm(causes_of_death, leave=False, desc='download'):
    # check if cause of death is filled
    if not open_directory(directory):
        download_statistics[f'{directory}'] = 'incorrect or missing cause of death'
        continue

    # year selection
    for year in years:
        # country selection
        for country in countries:
            select_age_groups()
            # check if country is filled; compared against False explicitly so
            # that only an explicit "not found" answer skips the country
            if select_country(country) is False:
                download_statistics[f'{country}'] = 'incorrect or missing country'
                continue

            # start from 'error' so the statistics never reuse the result of a
            # previous country (or hit an unbound name) when every attempt fails
            download_result = 'error'

            # download error protection: up to five attempts, with pauses that
            # grow with the attempt number
            for n in range(5):
                # protection against unexpected errors on the site
                try:
                    select_year(year)
                    select_data_type()
                    sleep(n)
                    apply_filter()
                    sleep(n + 2.5)

                    # data availability check
                    if not data_availability_check():
                        download_result = 'no data'
                        break

                    # check if the download of the csv file is complete
                    if download():
                        sleep(n + 1)
                        path_to_file = rename_file(
                            country, year, path_to_default_download_folder,
                            path_to_new_folder_with_data, directory)
                        # check downloaded csv file; only a validated file
                        # counts as 'done' (a rejected one has been deleted)
                        if check_downloaded_file(path_to_file, country, year):
                            download_result = 'done'
                            break
                        download_result = 'error'
                        sleep(3)
                    else:
                        # csv file not downloaded
                        download_result = 'error'

                    # retry download on a fresh page
                    browser.refresh()
                except Exception:
                    # Exception (not a bare except) so Ctrl-C can stop the run
                    download_result = 'error'
                    sleep(5)
                    browser.refresh()

            # browser refresh
            browser.refresh()
            download_statistics[f'{country}_{directory}_{year}'] = download_result

# closing the browser
# quit() ends the whole WebDriver session (all windows and the driver
# process); close() would only close the current window
browser.quit()

# print statistics, one (key, result) pair per line
print(*download_statistics.items(), sep='\n')

# print elapsed time in seconds since the script started
print(f'time: {time() - start_time}')