In [136]:
import BeautifulSoup, csv, re, subprocess
# !pip install BeautifulSoup

In [None]:
# Output directory that receives README.txt and the generated CSV files.
# NOTE(review): this name shadows the builtin dir(); it is kept because
# other cells in this notebook read it.
dir = 'LuxembourgIncomeStudy'
# IPython shell escape: create the directory if it is missing ($dir expands
# to the Python variable above).
!mkdir -p $dir

In [146]:
# Provenance note stored next to the generated CSV files.
README = """
Inequality and Poverty Key Figures from the Luxembourg Income Study

Downloaded from
http://www.lisdatacenter.org/lis-ikf-webapp/app/search-ikf-figures

and converted to CSV using LuxembourgIncomeStudy.ipynb
"""

# Write through a context manager so the handle is flushed and closed
# deterministically instead of whenever the interpreter collects it.
with open('%s/README.txt' % dir, 'w') as readme_file:
    readme_file.write(README)

In [138]:
# Fetch the key-figures page by shelling out to curl with a fixed set of
# request headers and form data.
# NOTE(review): the Cookie header carries a hard-coded JSESSIONID and analytics
# cookies; presumably they were copied from a browser session and may no longer
# be valid -- confirm whether the request still succeeds without them.
cmd = """curl
http://www.lisdatacenter.org/lis-ikf-webapp/app/search-ikf-figures
-H 'Pragma: no-cache' -H 'Origin: http://www.lisdatacenter.org'
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
-H 'Content-Type: application/x-www-form-urlencoded' -H 'Cache-Control: no-cache'
-H 'Referer: http://www.lisdatacenter.org/lis-ikf-webapp/app/search-ikf-figures'
-H 'Cookie: JSESSIONID=D4968A6FAB96533E24BC42EE534788E9; __utmt=1; __utma=212796297.324415436.1493049554.1493640200.1493642970.3; __utmb=212796297.7.10.1493642970; __utmc=212796297; __utmz=212796297.1493049554.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)' --data '_datasetIdList=1&_indicatorIdList=1&fromsearch=fromsearch'
""".replace('\n', ' ')
# The newlines of the literal were replaced by spaces above, so the shell
# receives a single command line. check_output raises CalledProcessError if
# curl exits with a non-zero status; `body` holds curl's standard output.
body = subprocess.check_output(cmd, shell=True)


In [139]:
# Parse the downloaded page and pick the last <table> element in the document.
soup = BeautifulSoup.BeautifulSoup(body)
tables = soup.findAll("table")
table = tables[-1]

# Every <tr> of that table, heading row included.
rows = table.findAll('tr')
print('Found table with %d rows' % len(rows))

# Sanity check on the download. Was 292 on 2017-May-01
assert len(rows) > 250

Found table with 292 rows


In [140]:
# Column headings come from the <th> cells of the first table row; the leading
# cell is skipped (presumably the row-label column -- confirm against the page).
header_cells = table.findAll('tr')[0].findAll('th')
headings_html = header_cells[1:]
metrics = [h.string for h in headings_html]

print('Found metrics %s' % ', '.join(metrics))
# Sanity check: the page is expected to list more than ten metrics.
assert len(metrics) > 10

Found metrics gini, atk5, atk1, d9010, d9050, d8020, poorAll4, poorAll5, poorAll6, poorK4, poorK5, poorK6, poorE4, poorE5, poorE6, d5075, d75150, d150, poortp, poorsm, pkidsm, eymed, average


In [141]:
# Nested results filled in by the row loop below:
# data[metric][country][year] -> value ('' when html_to_number found no text)
data = {}

def insert(dest, hierarchy, val):
    """Multi-level dictionary insert, creating new sublevels as needed.

    Performs dest[hierarchy[0]][hierarchy[1]]...[hierarchy[N]] = val

    dest is modified in place; hierarchy must contain at least one key
    (an empty sequence raises IndexError).
    """
    node = dest
    # Descend through every key except the last, adding empty dicts on the way.
    for key in hierarchy[:-1]:
        if key not in node:
            node[key] = {}
        node = node[key]
    # The final key receives the value itself.
    node[hierarchy[-1]] = val

def html_to_number(html):
    """Convert the text of a parsed table cell to a float.

    Parameters:
        html: an object whose ``string`` attribute is the cell text or None.

    Returns:
        The text parsed as a float, or '' when the cell has no text or only
        whitespace.

    Raises:
        ValueError: if the cell holds non-blank text that is not a number.
    """
    text = html.string
    if not text:
        return ''
    # Strip before the emptiness check: a cell holding only whitespace used to
    # reach float('') and abort the whole conversion.
    text = text.strip()
    if not text:
        return ''
    return float(text)

# Walk every row after the heading row and file each value under
# data[metric][country][year].
for row in rows[1:]:
    # Exactly one <th> per row: a four-character label made of a
    # two-character country code followed by a two-digit year.
    labels_html = row.findAll('th')
    assert len(labels_html) == 1
    label = labels_html[0].string
    assert len(label) == 4
    country, yy = label[0:2], label[2:4]

    # Two-digit years that sort before "30" are read as 20xx, the rest as 19xx.
    century = 2000 if yy < "30" else 1900
    yyyy = century + int(yy)

    # One <td> per metric, in the same order as the headings.
    cols_html = row.findAll('td')
    assert len(cols_html) == len(metrics)

    # Convert the whole row before storing anything from it.
    cols = [html_to_number(c) for c in cols_html]
    for (metric, col) in zip(metrics, cols):
        insert(data, [metric, country, yyyy], col)

In [143]:
# Maps each metric code to its description, rewritten so that every run of
# non-word characters becomes a single underscore (used later in file names).
descriptions = {}

# The page is expected to contain exactly one <p class="meta"> legend.
meta_html = soup.findAll('p', {'class':'meta'})
assert len(meta_html) == 1
meta_text = str(meta_html[0])

# Each legend entry: three whitespace characters, the metric code, a dash, and
# the description running up to the next '<'.
legend_entries = re.findall(r'\s\s\s(\w+)\s+-\W+(.*?)\W*<', meta_text)
descriptions.update(
    (name, re.sub(r'\W+', '_', desc)) for (name, desc) in legend_entries
)

descriptions

{'atk1': 'Atkinson_Coefficient_epsilon_1',
 'atk5': 'Atkinson_Coefficient_epsilon_0_5',
 'average': 'Mean_Equivalized_Income',
 'd150': 'Distribution_of_Children_by_Income_Group_above_150',
 'd5075': 'Distribution_of_Children_by_Income_Group_50_75',
 'd75150': 'Distribution_of_Children_by_Income_Group_75_150',
 'd8020': 'Percentile_Ratio_80_20',
 'd9010': 'Percentile_Ratio_90_10',
 'd9050': 'Percentile_Ratio_90_50',
 'eymed': 'Median_Equivalized_Income',
 'gini': 'Gini_Coefficient',
 'pkidsm': 'Children_Living_in_Single_Mother_Families',
 'poorAll4': 'Relative_Poverty_Rates_Total_Population_40',
 'poorAll5': 'Relative_Poverty_Rates_Total_Population_50',
 'poorAll6': 'Relative_Poverty_Rates_Total_Population_60',
 'poorE4': 'Relative_Poverty_Rates_Elderly_40',
 'poorE5': 'Relative_Poverty_Rates_Elderly_50',
 'poorE6': 'Relative_Poverty_Rates_Elderly_60',
 'poorK4': 'Relative_Poverty_Rates_Children_40',
 'poorK5': 'Relative_Poverty_Rates_Children_50',
 'poorK6': 'Relative_Poverty_Rates_Ch

In [144]:
def write_annual_csv(filename, table):
    """Write one metric as a CSV with a row per country and a column per year.

    Parameters:
        filename: path of the CSV file to create (overwritten if it exists).
        table: nested dict of the form table[country][year] -> value.

    The header row is 'Country' followed by every year present for any
    country, in ascending order. Country rows are written in sorted key order,
    and a year missing for a country becomes an empty cell. A one-line summary
    is printed afterwards, including the percentage of cells that are not the
    empty string (0.0 when there are no cells at all).
    """
    countries = sorted(table.keys())

    # Find complete set of years
    year_set = set()
    for country in countries:
        year_set |= set(table[country].keys())
    year_list = sorted(year_set)

    all_datapoints = []
    # Open the path given by the caller. Reading a module-level variable here
    # would only work when the caller happened to store the path under that name.
    with open(filename, 'w') as out:
        writer = csv.writer(out)
        writer.writerow(['Country'] + year_list)
        for country in countries:
            row_values = [table[country].get(year, '') for year in year_list]
            writer.writerow([country] + row_values)
            all_datapoints += row_values

    # Guard against an empty table (or one without any years): no cells means
    # there is nothing to divide by.
    if all_datapoints:
        defined = sum(1 for d in all_datapoints if d != '')
        percent_defined = 100.0 * defined / len(all_datapoints)
    else:
        percent_defined = 0.0
    print ('Created %s with %d years x %d countries (%.1f%%)' %
        (filename, len(year_list), len(table.keys()), percent_defined))

# Write one CSV per metric, named after the metric's sanitized description.
for metric, metric_table in data.items():
    dest = '%s/%s.csv' % (dir, descriptions[metric])
    write_annual_csv(dest, metric_table)

Created LuxembourgIncomeStudy/Distribution_of_Children_by_Income_Group_75_150.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Children_Poverty_Rates_Single_Mother_Families_50.csv with 43 years x 47 countries (14.3%)
Created LuxembourgIncomeStudy/Median_Equivalized_Income.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Gini_Coefficient.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Children_Living_in_Single_Mother_Families.csv with 43 years x 47 countries (14.3%)
Created LuxembourgIncomeStudy/Distribution_of_Children_by_Income_Group_above_150.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Relative_Poverty_Rates_Total_Population_60.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Relative_Poverty_Rates_Total_Population_50.csv with 43 years x 47 countries (14.4%)
Created LuxembourgIncomeStudy/Relative_Poverty_Rates_Total_Population_40.csv with 43 years x 47 countries (14.4%)

In [148]:
# IPython shell escape: copy the output directory to host tm1 with rsync
# (archive mode, verbose).
# NOTE(review): the directory name is spelled out here rather than taken from
# the `dir` variable -- keep the two in sync.
!rsync -av LuxembourgIncomeStudy tm1:/usr4/web/data.cmucreatelab.org/www/earthtime

building file list ... done
LuxembourgIncomeStudy/
LuxembourgIncomeStudy/Atkinson_Coefficient_epsilon_0_5.csv
LuxembourgIncomeStudy/Atkinson_Coefficient_epsilon_1.csv
LuxembourgIncomeStudy/Children_Living_in_Single_Mother_Families.csv
LuxembourgIncomeStudy/Children_Poverty_Rates_Single_Mother_Families_50.csv
LuxembourgIncomeStudy/Children_Poverty_Rates_Two_Parent_Families_50.csv
LuxembourgIncomeStudy/Distribution_of_Children_by_Income_Group_50_75.csv
LuxembourgIncomeStudy/Distribution_of_Children_by_Income_Group_75_150.csv
LuxembourgIncomeStudy/Distribution_of_Children_by_Income_Group_above_150.csv
LuxembourgIncomeStudy/Gini_Coefficient.csv
LuxembourgIncomeStudy/Mean_Equivalized_Income.csv
LuxembourgIncomeStudy/Median_Equivalized_Income.csv
LuxembourgIncomeStudy/Percentile_Ratio_80_20.csv
LuxembourgIncomeStudy/Percentile_Ratio_90_10.csv
LuxembourgIncomeStudy/Percentile_Ratio_90_50.csv
LuxembourgIncomeStudy/README.txt
LuxembourgIncomeStudy/Relative_Poverty_Rates_Children_40.csv
Luxembou