In [1]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters


In [2]:
# process 2018 tax help data
ato2016_data = pd.read_excel("atoabsgovhack2018.xlsx", sheet_name="ATO Data")
abs2016_data = pd.read_excel("atoabsgovhack2018.xlsx", sheet_name="ABS Data")
txc_data = pd.read_excel("atoabsgovhack2018.xlsx", sheet_name="Tax Help Center")
txc_data.rename(columns={'Post Code': 'Postcode'}, inplace=True)
seifa_data = pd.read_excel("atoabsgovhack2018.xlsx", sheet_name="ABS SEIFA ").convert_objects(convert_numeric=True)
seifa_data.rename(columns={'Postal Area (POA) Code': 'Postcode'}, inplace=True)
seifa_data.rename(columns={'Year': 'Income year'}, inplace=True)
seifa_data["Income year"] = seifa_data["Income year"].apply(lambda x: 2015 if x == 2011 else x)

# process 2017 tax help data
ato2015_data = pd.read_excel("atoabsgovhack2017.xlsx", sheet_name="Data", skiprows=0, usecols=[0,1,2,*range(3, 17)])
ato2015_data = ato2015_data.loc[ato2015_data['Income year'] == 2015]
abs2015_data = pd.read_excel("atoabsgovhack2017.xlsx", sheet_name="Data", skiprows=0, usecols=[0,1,2,*range(17, 56)])
abs2015_data = abs2015_data.loc[abs2015_data['Income year'] == 2015]

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


In [3]:
# process 2016 ato stats
df = pd.read_excel("taxstats2016individual06taxablestatusstateterritorypostcodetaxableincome.xlsx", sheet_name="Individuals Table 6B", skiprows=2, usecols=[1, 2, 4, 37, 39, 85, 93, 107, 129])
ato2016_stats = pd.DataFrame()
ato2016_stats['average income per person'] = df[df.columns[2]]/df[df.columns[1]]
ato2016_stats['unfranked ratio'] = df[df.columns[3]]/df[df.columns[1]]
ato2016_stats['franked ratio'] = df[df.columns[4]]/df[df.columns[1]]
ato2016_stats['cgt ratio'] = df[df.columns[5]]/df[df.columns[1]]
ato2016_stats['foreign income ratio'] = df[df.columns[6]]/df[df.columns[1]]
ato2016_stats['rent ratio'] = df[df.columns[7]]/df[df.columns[1]]
ato2016_stats['business ratio'] = df[df.columns[8]]/df[df.columns[1]]
ato2016_stats['Postcode'] = df['Postcode']
ato2016_stats = ato2016_stats.loc[ato2016_stats['Postcode'].isin(list(range(100,9999)))]
ato2016_stats['Postcode'] = ato2016_stats['Postcode'].astype(np.int64)
ato2016_stats['Income year'] = 2016

# process 2015 ato stats
df = pd.read_excel("taxstats2015individual06taxablestatusstateterritorypostcode.xlsx", sheet_name="Individuals Table 6B", skiprows=2, usecols=[1, 2, 4, 37, 39, 79, 87, 101, 123])
ato2015_stats = pd.DataFrame()
ato2015_stats['average income per person'] = df[df.columns[2]]/df[df.columns[1]]
ato2015_stats['unfranked ratio'] = df[df.columns[3]]/df[df.columns[1]]
ato2015_stats['franked ratio'] = df[df.columns[4]]/df[df.columns[1]]
ato2015_stats['cgt ratio'] = df[df.columns[5]]/df[df.columns[1]]
ato2015_stats['foreign income ratio'] = df[df.columns[6]]/df[df.columns[1]]
ato2015_stats['rent ratio'] = df[df.columns[7]]/df[df.columns[1]]
ato2015_stats['business ratio'] = df[df.columns[8]]/df[df.columns[1]]
ato2015_stats['Postcode'] = df['Postcode']
ato2015_stats = ato2015_stats.loc[ato2015_stats['Postcode'].isin(list(range(100,9999)))]
ato2015_stats['Postcode'] = ato2015_stats['Postcode'].astype(np.int64)
ato2015_stats['Income year'] = 2015

In [4]:
# join datasets
df = pd.DataFrame()
df = df.append(ato2016_data)
df = df.append(ato2015_data)
abs_data = abs2016_data.append(abs2015_data)
df = df.merge(abs_data, on=["Income year", "Postcode"], how="outer")
ato_stats = ato2016_stats.append(ato2015_stats)
df = df.merge(ato_stats, on=["Income year", "Postcode"], how="outer")
df = df.merge(seifa_data, on=["Income year", "Postcode"], how="outer")
df = df.merge(txc_data, on="Postcode", how="outer")
df.fillna(0, inplace=True)
# TEMPORARY: limit to 2015/2016 data
df = df[df['Income year'].isin([2016, 2015])]
df.head()

Unnamed: 0,id_x,Income year,Postcode,Individuals1,Taxable income or loss1,Net tax,Gross interest,Net rent,Net capital gain,Total income or loss,...,cgt ratio,foreign income ratio,rent ratio,business ratio,Index of Relative Socio-economic Advantage and Disadvantage,Index of Relative Socio-economic Disadvantage,Index of Economic Resources,Index of Education and Occupation,Usual Resident Population,Count
2,201600800.0,2016.0,800,5464.0,389375600.0,101020407.0,2146701.0,-4580471.0,2352866.0,401984100.0,...,0.022328,0.026537,0.037518,0.042277,1096.0,1066.0,946.0,1089.0,6464.0,0.0
3,201500800.0,2015.0,800,5579.0,345853400.0,83997228.0,2280912.0,-4125084.0,3027094.0,358517700.0,...,0.022585,0.023839,0.037103,0.044452,1072.0,1060.0,952.0,1077.0,4564.0,0.0
6,201600810.0,2016.0,810,21128.0,1367380000.0,315901076.0,9757680.0,-17414593.0,8046750.0,1423396000.0,...,0.028446,0.037344,0.061246,0.084154,1052.0,1037.0,1014.0,1045.0,33302.0,1.0
7,201500810.0,2015.0,810,20792.0,1326501000.0,302144669.0,11336449.0,-14253016.0,10798280.0,1381329000.0,...,0.029434,0.035879,0.062716,0.086235,1037.0,1027.0,1008.0,1051.0,29725.0,1.0
10,201600812.0,2016.0,812,11509.0,728206300.0,162280073.0,4232709.0,-10799838.0,2549416.0,755230100.0,...,0.026327,0.034669,0.057694,0.073073,1020.0,1013.0,1013.0,997.0,18873.0,1.0


In [5]:
# specify features columns
features = df.columns[3:-3]
df[features].columns

# feature correlations
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    count_corr = df[['Count'] + list(features)].corr(method='pearson')['Count']
    display(count_corr[count_corr > 0.5])

Count          1.00000
Not married    0.50443
Name: Count, dtype: float64

In [6]:
# create label column and train/test split
df['label'] = df['Count']
# df['label'] = (df['Count'] > 1).astype(np.int64)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .80
train, test = df[df['is_train']==True], df[df['is_train']==False]
print("Train Class Balance:", train[train['label']==0].shape[0], " / ", train[train['label']==1].shape[0])
print("Test Class Balance:", test[test['label']==0].shape[0], " / ", test[test['label']==1].shape[0])
train.head()

Train Class Balance: 3481  /  632
Test Class Balance: 802  /  170


Unnamed: 0,id_x,Income year,Postcode,Individuals1,Taxable income or loss1,Net tax,Gross interest,Net rent,Net capital gain,Total income or loss,...,rent ratio,business ratio,Index of Relative Socio-economic Advantage and Disadvantage,Index of Relative Socio-economic Disadvantage,Index of Economic Resources,Index of Education and Occupation,Usual Resident Population,Count,label,is_train
2,201600800.0,2016.0,800,5464.0,389375600.0,101020407.0,2146701.0,-4580471.0,2352866.0,401984100.0,...,0.037518,0.042277,1096.0,1066.0,946.0,1089.0,6464.0,0.0,0.0,True
3,201500800.0,2015.0,800,5579.0,345853400.0,83997228.0,2280912.0,-4125084.0,3027094.0,358517700.0,...,0.037103,0.044452,1072.0,1060.0,952.0,1077.0,4564.0,0.0,0.0,True
6,201600810.0,2016.0,810,21128.0,1367380000.0,315901076.0,9757680.0,-17414593.0,8046750.0,1423396000.0,...,0.061246,0.084154,1052.0,1037.0,1014.0,1045.0,33302.0,1.0,1.0,True
7,201500810.0,2015.0,810,20792.0,1326501000.0,302144669.0,11336449.0,-14253016.0,10798280.0,1381329000.0,...,0.062716,0.086235,1037.0,1027.0,1008.0,1051.0,29725.0,1.0,1.0,True
10,201600812.0,2016.0,812,11509.0,728206300.0,162280073.0,4232709.0,-10799838.0,2549416.0,755230100.0,...,0.057694,0.073073,1020.0,1013.0,1013.0,997.0,18873.0,1.0,1.0,True


In [7]:
# build predictive model
def baseline_model():
	model = Sequential()
	model.add(Dense(64, input_dim=64, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [8]:
# baseline model accuracy (cross validation)
kfold = KFold(n_splits=10)
scores = cross_val_score(estimator, train[features], train['label'], cv=kfold)
print("Baseline Accuracy: %0.2f (+/- %0.2f) MSE" % (scores.mean(), scores.std() * 2))

Baseline Accuracy: nan (+/- nan) MSE


In [9]:
# standardise data
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', estimator))
pipeline = Pipeline(estimators)

# standardized model accuracy (cross validation)
kfold = KFold(n_splits=10)
scores = cross_val_score(pipeline, train[features], train['label'], cv=kfold)
print("Standardized Accuracy: %0.2f (+/- %0.2f) MSE" % (scores.mean(), scores.std() * 2))

Standardized Accuracy: -2.56 (+/- 10.78) MSE
