In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import pickle
import matplotlib.pyplot as pl
import re
import os
from tqdm import tqdm
import json

In [3]:
def convert(val):
    """Convert a value to numeric if it is."""
    val = val.replace(',', '')
    val = float(val) if val.isdigit() else val
    return val

def get_values_2006(table):
    """Parse table for 2006 html page."""
    # find values
    values = []
    for row in table.find_all('tr')[1:]:
        if row.find('td') is None:
            continue
        record = [row.find_all('td')[0].text]
        records = record + [convert(x.text) for x in row.find_all('td')[1:]]
        if len(records) == 1:
            continue
        values.append(records)
    return values

def get_values_2016(table):
    """Parse table for 2016 html page."""
    # find values
    values = []
    for row in table.find_all('tr')[1:]:
        if row.find('td') is None:
            continue
        record = [row.find_all('th')[0].text]
        records = record + [convert(x.text) for x in row.find_all('td')]
        values.append(records)
    return values

def parse_common_table(table, year):
    """Parse table that has common row names across different suburbs."""
    # find column names
    row = table.find_all('tr')[0]
    columns = [x.text for x in row.find_all('th')]
    for i, x in enumerate(columns):
        if x == '%':
            columns[i] = columns[i - 1] + " (%)"
    
    if year != '2016':
        values = get_values_2006(table)
    else:
        values = get_values_2016(table)
    
    data = pd.DataFrame(values, columns=columns).T
    data.columns = data.iloc[0]
    data = data.iloc[1:]

    # create double columns for percentage
    columns = list(data.columns)
    newcolumns = [x + ' (%)' for x in columns]
    for x in newcolumns:
        data[x] = None
        
    # find percentage rows and fill in value
    index = [x for x in data.index if '%' in x]
    for i in index:
        record = data.loc[i]
        rowname = ' '.join(i.split(' ')[:-1])
        for x in columns:
            data.at[rowname, x + ' (%)'] =  record[x]
    
    # drop those rows
    data = data.drop(index)
    return data

In [4]:
common_map = {}
uncommon_map = {}

for year in ['2001', '2006', '2011', '2016']:
    with open('source/source{}.pkl'.format(year), 'rb') as f:
        d = pickle.load(f)
    soup = bs(d['Burwood'], 'html')
    tables = soup.find_all('table')

    cmap = {}
    ucmap = {}
    for i, tb in enumerate(tables):
        try:
            z = re.search(r'(\<\!--).+(-->)', str(tb))
            name = z.group().replace('<!-- ', '').replace('-->', '').strip()
        except AttributeError:
            name = tb.text.strip().split('\n')[0]
        try:
            df = parse_common_table(tb, year)
            if len(df.columns) <= 1:
                ucmap[i] = name
            else:
                cmap[i] = name
        except:
            cmap[i] = name
    # append to overall mapping
    common_map[year] = cmap
    uncommon_map[year] = ucmap

In [5]:
non2016 = [0, 1, 2, 3, 4]
yes2016 = [0, 1, 2, 3]
for year in common_map:
    exclusion = yes2016 if year == '2016' else non2016
    common_map[year] = {k: v for k, v in common_map[year].items() if k not in exclusion and v != ''}
    
# country of birth should be taken out
for year in common_map:
    c = common_map[year]
    topop = None
    for k, v in c.items():
        if v == 'Country of birth':
            uncommon_map[year][k] = v
            topop = k
    common_map[year].pop(k)
uncommon_map

{'2001': {9: 'Religious affiliation, top responses',
  10: 'Language, top responses (other than English)',
  12: 'Occupation',
  13: 'Industry of employment, top responses',
  8: 'Country of birth'},
 '2006': {9: 'Religious affiliation, top responses',
  10: 'Language, top responses (other than English)',
  12: 'Occupation',
  13: 'Industry of employment, top responses',
  21: 'Household composition',
  8: 'Country of birth'},
 '2011': {11: 'Ancestry, top responses',
  14: 'Religious affiliation, top responses',
  15: 'Language, top responses (other than English)',
  19: 'Occupation',
  20: 'Industry of employment, top responses',
  22: 'Travel to work, top responses',
  12: 'Country of birth'},
 '2016': {9: 'Ancestry, top responses',
  12: 'Country of birth of father',
  13: 'Country of birth of mother',
  14: 'Religious affiliation',
  15: 'Language, top responses5',
  18: 'Occupation',
  19: 'Industry of employment',
  21: 'Travel to work',
  10: 'Country of birth'}}

In [29]:
year = '2016'
with open('source/source{}.pkl'.format(year), 'rb') as f:
    source = pickle.load(f)
uncommon_map
uncommon_map[year]
index = 6
tablename = list(uncommon_map[year].values())[index]
suburb = 'Burwood'

soup = bs(source[suburb], 'html')
tables = soup.find_all('table')

tables[index]

# find javascript
scripts = soup.find_all('script')
javascript = [x for x in scripts if 'src' not in x.attrs and 'type' in x.attrs
              and x.attrs['type'] == 'text/javascript'][0]
tablename

# uncommon_map

'Industry of employment'

In [30]:
table = tables[index]
# find column names

tables[index]

# find javascript
scripts = soup.find_all('script')
javascript = [x for x in scripts if 'src' not in x.attrs and 'type' in x.attrs
              and x.attrs['type'] == 'text/javascript'][0]

row = table.find_all('tr')[0]
columns = [x.text for x in row.find_all('th')]
for i, x in enumerate(columns):
    if x == '%':
        columns[i] = columns[i - 1] + " (%)"
        
columns

['Social marital statusPeople aged 15 years and over',
 'Burwood (NSW)',
 'Burwood (NSW) (%)',
 'New South Wales',
 'New South Wales (%)',
 'Australia',
 'Australia (%)']

In [6]:
pattern_mapping = {}

p2016 = {}
for _, k in uncommon_map['2016'].items():
    newk = k.split(',')[0]
    newk = newk.replace('birth of', '')
    newk = newk.replace(' of employment', '')
    newk = newk.split(' ')
    if len(newk) > 1:
        newk[1:] = [x.capitalize() for x in newk[1:]]
    newk[0] = newk[0].lower()
    newk = ''.join(newk)
    p2016[k] = 'var {}Data'.format(newk)

pattern_mapping['2016'] = p2016

In [133]:
i = 0
var = list(p2016.values())[i]
original_name = list(uncommon_map['2016'].values())[i]
get_vardata(var, javascript, original_name, 'Burwood')

Unnamed: 0,"Ancestry, top responses",Burwood (%),New South Wales (%),Australia (%),Burwood,New South Wales,Australia
0,Chinese,45.1,5.2,3.9,8096,514594,1213903
0,English,7.0,23.3,25.0,1248,2302481,7852224
0,Australian,5.3,22.9,23.3,944,2261062,7298243
0,Indian,3.9,2.1,2.0,694,211927,619164
0,Korean,3.7,0.7,0.4,667,66613,123017


# Parse Javascript Tables

In [10]:
year = '2016'
dataroot = 'census_data'

with open('source/source{}.pkl'.format(year), 'rb') as f:
    source = pickle.load(f)

# iterate all common tables
for index, tablename in uncommon_map[year].items():
    name = '{}_{}.csv'.format(year, tablename).replace(' ', '_').replace(',', '')
    name = name.replace('/', '_')
    path = os.path.join(dataroot, name)
    print('Year={} Table={}'.format(year, tablename))
    if os.path.exists(path):
        continue
    group = []
    for i, (suburb, txt) in enumerate(source.items()):
        if i % 100 == 0:
            print('    Iteration i={}'.format(i))
        soup = bs(txt, 'html')
        try:
            var = pattern_mapping[year][tablename]
            sdata = parse_uncommon_table(soup, var, tablename, suburb)
            group.append(sdata)
        except IndexError:
            print('  Error with {}, index={}'.format(suburb, index))
            continue
    data = parse_uncommon_allsuburbs(group)
    print('Saving to', path)
    data.to_csv(path)

Year=2016 Table=Ancestry, top responses
    Iteration i=0
    Iteration i=100
    Iteration i=200
  Error with Dural, index=9
    Iteration i=300
  Error with Hillsborough, index=9
  Error with Long Point, index=9
    Iteration i=400
  Error with Maryland, index=9
  Error with Mayfield, index=9
    Iteration i=500
  Error with Punchbowl, index=9
    Iteration i=600
  Error with Springfield, index=9
  Error with St Clair, index=9
    Iteration i=700
Saving to census_data/2016_Ancestry_top_responses.csv
Year=2016 Table=Country of birth of father
    Iteration i=0
    Iteration i=100
    Iteration i=200
  Error with Dural, index=12
    Iteration i=300
  Error with Hillsborough, index=12
  Error with Long Point, index=12
    Iteration i=400
  Error with Maryland, index=12
  Error with Mayfield, index=12
    Iteration i=500
  Error with Punchbowl, index=12
    Iteration i=600
  Error with Springfield, index=12
  Error with St Clair, index=12
    Iteration i=700
Saving to census_data/2016_Co

In [8]:
def parse_uncommon_allsuburbs(group):
    """Parse uncommon table of all suburbs in group."""
    # drop NSW and Australia in 1: tables
    for i, df in enumerate(group):
        if i > 0:
            df = df.drop(['New South Wales (%)', 'Australia (%)', 'New South Wales', 'Australia'], axis=1)
            group[i] = df
    # outer join all dataframes
    key = group[0].columns[0]
    data = pd.merge(group[0], group[1], how='outer', on=key)
    for i in range(2, len(group)):
        data = pd.merge(data, group[i], how='outer', on=key)
    
    # transpose and set column names
    data = data.T
    data.columns = data.iloc[0]
    data = data.iloc[1:]

    # create double columns for percentage
    columns = list(data.columns)
    newcolumns = [x + ' (%)' for x in columns]
    for x in newcolumns:
        data[x] = None

    # find percentage rows and fill in value
    index = [x for x in data.index if '%' in x]
    for i in index:
        record = data.loc[i]
        rowname = ' '.join(i.split(' ')[:-1])
        for x in columns:
            data.at[rowname, x + ' (%)'] =  record[x]

    # drop those rows
    data = data.drop(index)
    return data

In [73]:
chinese = data[~pd.isnull(data['Chinese'])][['Chinese', 'Chinese (%)']]
chinese['Chinese (%)'] = chinese['Chinese (%)'].astype(float)
chinese.sort_values('Chinese (%)', ascending=False).head(50)

"Ancestry, top responses",Chinese,Chinese (%)
Hurstville,16403,49.4
Burwood,8096,45.1
Rhodes,5848,44.5
Eastwood,8071,38.4
Ultimo,3709,36.6
East Killara,1286,35.9
Chatswood,10102,34.1
Zetland,4116,33.8
Chippendale,3406,33.1
Carlingford,9302,32.0


In [9]:
toaddquote = ['categoryField', 'areaPercent', 'statePercent', 'australiaPercent',
              'areaValue', 'stateValue', 'australiaValue']
toremove = ['QuickStats.formatValue(', ')']

def parse_uncommon_table(soup, var, original_name, suburb):
    """Get JS variable data from HTML."""
    # find javascript
    scripts = soup.find_all('script')
    js = [x for x in scripts if 'src' not in x.attrs and 'type' in x.attrs and x.attrs['type'] == 'text/javascript'][0]
    # pattern name
    pattern = '(' + var + ' = \[)[^\]]*(\];)'
    toreplace = var + ' = ['
    m = re.search(pattern, str(js))
    j = m.group().replace(toreplace, '').replace('];', '').strip()
    j = j.replace('\n', '').replace('\t', '')

    for val in toaddquote:
        j = j.replace(val, '"{}"'.format(val))

    for val in toremove:
        j = j.replace(val, '')

    values = j.split('},')
    values = [x + '}' for x in values]
    values = [x.replace('}}', '}') for x in values]

    records = []
    for x in values:
        dct = json.loads(x)
        dct = {k: [v] for k, v in dct.items()}
        subrecord = pd.DataFrame().from_dict(dct)
        records.append(subrecord)
    df = pd.concat(records, axis=0)
    df.columns = [original_name, suburb + ' (%)', 'New South Wales (%)', 'Australia (%)',
                  suburb, 'New South Wales', 'Australia']
    return df

It is consistent that we need to parse table 4 to table 39 for all 4 years.

### Parse common tables

In [None]:
commontables = {}

errortables = None
dataroot = 'census_data'
if not os.path.exists(dataroot):
    os.mkdir(dataroot)
    
for year in ['2001', '2006', '2011', '2016']: 
    # open pickle file
    with open('source{}.pkl'.format(year), 'rb') as f:
        source = pickle.load(f)
    
    # iterate all common tables
    for index, tablename in common_map[year].items():
        path = os.path.join(dataroot, '{}_{}.csv'.format(year, tablename).replace(' ', '_'))
        print('Year={} Table={}'.format(year, tablename))
        if os.path.exists(path):
            continue
        group = []
        for i, (suburb, txt) in enumerate(source.items()):
            if i % 100 == 0:
                print('    Iteration i={}'.format(i))
            soup = bs(txt, 'html')
            # find all tables
            tables = soup.find_all('table')
            try:
                sdata = parse_common_table(tables[index], year)
                group.append(sdata)
            except IndexError:
                print('  Error with {}, index={}'.format(suburb, index))
                errortables = tables
                break
            
        df = pd.concat(group, axis=0).drop_duplicates()
        print('Saving to', path)
        df.to_csv(path)

Family composition,Couple family without children,Couple family with children,One parent family,Other family,Total families,Couple family without children (%),Couple family with children (%),One parent family (%),Other family (%),Total families (%)
Abbotsbury,200.0,774.0,127,7,1108.0,18.1,69.9,11.5,0.6,--
Australia,1943640.0,2362580.0,823254,89686,5219160.0,37.2,45.3,15.8,1.7,--


In [230]:
ages = tables[4]
# tables[4]

table = tables[4]

# find column names
row = table.find_all('tr')[0]
columns = [x.text for x in row.find_all('th')]
for i, x in enumerate(columns):
    if x == '%':
        columns[i] = columns[i - 1] + " (%)"
columns

values = get_values_2006(table)

values

data = pd.DataFrame(values, columns=columns).T
data.columns = data.iloc[0]
data = data.iloc[1:]

data

Median weekly incomes,Personal,Family,Household,Personal (%),Family (%),Household (%)
Burwood,370,1115,1005,--,--,--
Australia,466,1171,1027,--,--,--


In [126]:
scripts = soup.find_all('script')
scripts[9].attrs

{'type': 'text/javascript'}

In [231]:
javascript = [x for x in scripts if 'src' not in x.attrs and 'type' in x.attrs
              and x.attrs['type'] == 'text/javascript'][0]

# javascript

In [88]:
re.compile("var ancestryData = \[\];").findall(javascript.text)

[]

In [28]:
float(people.find_all('td')[1].text.replace(',', ''))

21260.0