In [27]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import re
from map_countries import map_countries


In [28]:
def scrape_gdp_pp():
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table',{'class':'wikitable sortable'})
    country_rows = tables[0].findAll('tr')
    countries_data = []

    for country in country_rows[1:]:    
        country_data = country.findAll('td')
        name = country_data[1].text
        country_name = unicodedata.normalize("NFKD", name)
        country_name = country_name.strip()
        gdp_pp = int(country_data[2].text.strip().replace(',',''))
#         if (country_name in worldcup_teams):
        countries_data.append({"name":country_name,'gdp_pp':gdp_pp})

    return pd.DataFrame(countries_data)

In [29]:
gdp_df = scrape_gdp_pp()

In [30]:
gdp_df['name'] = gdp_df['name'].copy().apply(map_countries)

In [31]:
def scrape_temp():
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_average_yearly_temperature"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table',{'class':'wikitable sortable'})
    country_rows = tables[0].findAll('tr')
    countries_temp = []
    
    for country in country_rows[1:]:    
        country_data = country.findAll('td')
        name = country_data[0].text
        country_name = unicodedata.normalize("NFKD", name)
        country_name = country_name.strip()
        temp_str = country_data[1].text.strip()
        temp_str = temp_str.replace('−','-')
        temp = float(temp_str)  
#         if (country_name in worldcup_teams):
        countries_temp.append({"name":country_name,'temp':temp})
    return pd.DataFrame(countries_temp)


In [32]:
temps = scrape_temp()


In [33]:
temps['name'] = temps['name'].copy().apply(map_countries)

In [34]:
set(gdp_df.name.unique()) - set(temps.name.unique()) 

{'hong kong',
 'kosovo',
 'macau',
 'micronesia',
 'nauru',
 'puerto rico',
 'south sudan',
 'taiwan',
 'world[n 1]'}

In [35]:
set(temps.name.unique()) - set(gdp_df.name.unique())

{'andorra',
 'cuba',
 'federated states of micronesia',
 'korea dpr',
 'liechtenstein',
 'monaco',
 'somalia',
 'syria'}

In [36]:
combined_df = temps.merge(gdp_df,on='name', how = 'outer')

In [37]:
def scrape_population():
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table',{'class':'wikitable'})
    country_rows = tables[0].findAll('tr')

    countries_popu = []

    for country in country_rows[2:]:

        country_data = country.findAll('td')

        country_name_raw = country_data[1].text
        country_name = country_name_raw.strip()
        m = re.search('\[\w+\]', country_name_raw)
        if (m != None):
            citation = m.group(0)
            country_name = country_name.replace(citation, "")

        population = int(country_data[5].text.strip().replace(',',''))

#         if (country_name in worldcup_teams):
        countries_popu.append({"name":country_name,'population':population})
            
    return pd.DataFrame(countries_popu)

In [38]:
countries_popu = scrape_population()

In [39]:
countries_popu['name'] = countries_popu['name'].copy().apply(map_countries)

In [40]:
set(countries_popu.name.unique()) - set(combined_df.name.unique()) 

{'american samoa',
 'anguilla',
 'aruba',
 'bermuda',
 'british virgin islands',
 'caribbean netherlands',
 'cayman islands',
 'cook islands',
 'curaçao',
 'falkland islands',
 'faroe islands',
 'french guiana',
 'french polynesia',
 'gibraltar',
 'greenland',
 'guadeloupe',
 'guam',
 'guernsey and \xa0jersey',
 'isle of man',
 'martinique',
 'mayotte',
 'montserrat',
 'new caledonia',
 'niue',
 'northern mariana islands',
 'palestine',
 'réunion',
 'saint helena, ascension and tristan da cunha',
 'saint pierre and miquelon',
 'sint maarten',
 'tokelau',
 'turks and caicos islands',
 'united states virgin islands',
 'vatican city',
 'wallis and futuna',
 'western sahara'}

In [41]:
set(combined_df.name.unique()) - set(countries_popu.name.unique())

{'kosovo', 'micronesia', 'world[n 1]'}

In [42]:
combined_df = combined_df.merge(countries_popu, on = 'name', how = 'left')


In [43]:
combined_df = combined_df[~combined_df['population'].isnull()]

In [44]:
combined_df.to_csv('data/demographics.csv', index = False)