In [160]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fuzzywuzzy import process

In [161]:
r = requests.get("https://www.worldlifeexpectancy.com/", headers={'User-Agent': 'Mozilla/5.0'})

In [164]:
soup = BeautifulSoup(r.content, "html.parser")
countries = soup.find(id="country").find_all('option', class_='wle_select')

countryList = []
for country in countries:
    countryList.append(country.attrs['value'])
    
countryList.remove('world')
print(countryList)

['afghanistan', 'albania', 'algeria', 'angola', 'antigua-and-barbuda', 'arab-emirates', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia-herzeg', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina-faso', 'burundi', 'cambodia', 'cameroon', 'canada', 'cape-verde', 'central-africa', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa-rica', 'cote-d-ivoire', 'croatia', 'cuba', 'cyprus', 'czech-republic', 'denmark', 'djibouti', 'dominican-rep', 'dr-congo', 'ecuador', 'egypt', 'el-salvador', 'equatorial-guinea', 'eritrea', 'estonia', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'gambia', 'georgia', 'germany', 'ghana', 'greece', 'grenada', 'guatemala', 'guinea', 'guinea-bissau', 'guyana', 'haiti', 'honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'ke

In [78]:
countrySoups = {}
for country in countryList:
    r = requests.get("https://www.worldlifeexpectancy.com/country-health-profile/"+country, headers={'User-Agent': 'Mozilla/5.0'})
    countrySoups[country] = BeautifulSoup(r.content, "html.parser")

In [165]:
death_causes = {}
for country in countryList:
    table = countrySoups[country].find('div', class_='lhc_cause_total_section')
    names = table.find_all('td', class_='lhc_name')
    values = table.find_all('div', class_='lhc_type_hidden')

    death_causes[country] = {}
    for i in range(len(names)):
        death_causes[country][names[i].text] = values[2*i+1].text

In [166]:
pd.DataFrame(countryList, columns=['Country']).to_csv('./WebScraped/countryList.csv')

for country in countryList:
    df = pd.DataFrame(list(death_causes[country].items()), columns=['DeathCause', 'Percentage'])
    df.set_index('DeathCause')
    df.to_csv('./WebScraped/DeathCauses/'+country+'.csv')

In [167]:
# Assigning ISO 3 codes manually? No thank you.
codes = pd.read_csv('./WebScraped/countryISO3Codes.csv')

matches = {c:process.extractOne(c, codes['Country'])[0] for c in countryList}
ISO3 = [codes[codes['Country'] == matches[c]]['ISO3'].values[0] for c in countryList]
print(ISO3)
print(countryList)

['AFG', 'ALB', 'DZA', 'AGO', 'ATG', 'ARE', 'ARG', 'ARM', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'COG', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'KWT', 'KGZ', 'BRB', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LTU', 'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MRT', 'MUS', 'MEX', 'FSM', 'MDA', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'GIN', 'NZL', 'NIC', 'NER', 'NGA', 'MKD', 'PRK', 'NOR', 'OMN', 'PAK', 'PAN', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', 'RWA', 'LCA', 'VCT', 'WSM', 'STP', 'SAU', 'SEN', 'SRB'

In [168]:
pd.DataFrame({'Country':countryList , 'Code':ISO3}).to_csv('./WebScraped/ScrapedNames2Codes.csv')