In [None]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# process 2018 tax help data
# All four sheets live in the same workbook, so parse the file once and pull the
# individual frames out of the returned {sheet name: DataFrame} mapping.
# `sheet_name` replaces the `sheetname` keyword, which current pandas releases
# no longer accept.
govhack2018_sheets = pd.read_excel(
    "atoabsgovhack2018.xlsx",
    sheet_name=["ATO Data", "ABS Data", "Tax Help Center", "ABS SEIFA "],
)
ato2016_data = govhack2018_sheets["ATO Data"]
abs2016_data = govhack2018_sheets["ABS Data"]
txc_data = govhack2018_sheets["Tax Help Center"]
# The trailing space in this sheet name is deliberate: it is the exact name
# the workbook lookup has always used.
seifa_data = govhack2018_sheets["ABS SEIFA "]

In [None]:
# clean 2018 tax help data
def _seifa_year_to_income_year(year):
    """Relabel the SEIFA year 2011 as income year 2015; pass any other value through."""
    if year == 2011:
        return 2015
    return year


# Give the join keys the same column names in every frame.
txc_data = txc_data.rename(columns={'Post Code': 'Postcode'})
seifa_data = seifa_data.rename(columns={
    'Postal Area (POA) Code': 'Postcode',
    'Year': 'Income year',
})
seifa_data["Income year"] = seifa_data["Income year"].map(_seifa_year_to_income_year)

In [None]:
# process 2017 tax help data
# Both extracts come from the single "Data" sheet of the 2017 workbook. The first
# three columns are read into both frames (presumably the join keys - confirm
# against the sheet); columns 3-16 go to the ATO frame and 17-55 to the ABS frame.
# `sheet_name` replaces the `sheetname` keyword, which current pandas releases
# no longer accept.
shared_columns = [0, 1, 2]
ato2015_data = pd.read_excel(
    "atoabsgovhack2017.xlsx",
    sheet_name="Data",
    skiprows=0,
    usecols=shared_columns + list(range(3, 17)),
)
abs2015_data = pd.read_excel(
    "atoabsgovhack2017.xlsx",
    sheet_name="Data",
    skiprows=0,
    usecols=shared_columns + list(range(17, 56)),
)

In [None]:
# clean 2017 tax help data
# Keep only the rows recorded against income year 2015 in each 2017 extract.
ato2015_data = ato2015_data[ato2015_data['Income year'].eq(2015)]
abs2015_data = abs2015_data[abs2015_data['Income year'].eq(2015)]

In [None]:
# process 2016 ato stats
# Load nine columns of "Individuals Table 6B" (two header rows skipped).
# `sheet_name` replaces the `sheetname` keyword, which current pandas releases
# no longer accept.
taxstats2016_raw = pd.read_excel(
    "taxstats2016individual06taxablestatusstateterritorypostcodetaxableincome.xlsx",
    sheet_name="Individuals Table 6B",
    skiprows=2,
    usecols=[1, 2, 4, 37, 39, 85, 93, 107, 129],
)

# Every measure is the loaded column at the given position divided by the loaded
# column at position 1 (presumably the number of individuals - confirm against
# the sheet header).
denominator_2016 = taxstats2016_raw.iloc[:, 1]
measure_positions_2016 = [
    ('average income per person', 2),
    ('unfranked ratio', 3),
    ('franked ratio', 4),
    ('cgt ratio', 5),
    ('foreign income ratio', 6),
    ('rent ratio', 7),
    ('business ratio', 8),
]
ato2016_stats = pd.DataFrame(index=taxstats2016_raw.index)
for measure_name, position in measure_positions_2016:
    ato2016_stats[measure_name] = taxstats2016_raw.iloc[:, position] / denominator_2016

# Min-max scale each measure, then score every row as 1 minus the mean of its
# scaled measures.
scaled_2016 = preprocessing.MinMaxScaler().fit_transform(ato2016_stats.values)
ato2016_stats_norm = pd.DataFrame(scaled_2016, columns=ato2016_stats.columns)
ato2016_stats_norm['average'] = 1 - ato2016_stats_norm.mean(axis=1)

# Re-attach the postcode and keep only rows whose postcode is an integer in
# 100..9998; other rows are dropped.
# NOTE(review): the upper bound excludes 9999 - confirm that is intended.
ato2016_stats_norm['Postcode'] = taxstats2016_raw['Postcode']
has_valid_postcode = ato2016_stats_norm['Postcode'].isin(list(range(100, 9999)))
# .copy() so the column assignments below write to an independent frame rather
# than a slice (avoids SettingWithCopyWarning).
ato2016_stats_norm = ato2016_stats_norm.loc[has_valid_postcode].copy()
ato2016_stats_norm['Postcode'] = ato2016_stats_norm['Postcode'].astype(np.int64)
ato2016_stats_norm['Income year'] = 2016

In [None]:
# process 2015 ato stats
# Load nine columns of "Individuals Table 6B" (two header rows skipped).
# `sheet_name` replaces the `sheetname` keyword, which current pandas releases
# no longer accept.
taxstats2015_raw = pd.read_excel(
    "taxstats2015individual06taxablestatusstateterritorypostcode.xlsx",
    sheet_name="Individuals Table 6B",
    skiprows=2,
    usecols=[1, 2, 4, 37, 39, 79, 87, 101, 123],
)

# Every measure is the loaded column at the given position divided by the loaded
# column at position 1 (presumably the number of individuals - confirm against
# the sheet header).
denominator_2015 = taxstats2015_raw.iloc[:, 1]
measure_positions_2015 = [
    ('average income per person', 2),
    ('unfranked ratio', 3),
    ('franked ratio', 4),
    ('cgt ratio', 5),
    ('foreign income ratio', 6),
    ('rent ratio', 7),
    ('business ratio', 8),
]
ato2015_stats = pd.DataFrame(index=taxstats2015_raw.index)
for measure_name, position in measure_positions_2015:
    ato2015_stats[measure_name] = taxstats2015_raw.iloc[:, position] / denominator_2015

# Min-max scale each measure, then score every row as 1 minus the mean of its
# scaled measures.
scaled_2015 = preprocessing.MinMaxScaler().fit_transform(ato2015_stats.values)
ato2015_stats_norm = pd.DataFrame(scaled_2015, columns=ato2015_stats.columns)
ato2015_stats_norm['average'] = 1 - ato2015_stats_norm.mean(axis=1)

# Re-attach the postcode and keep only rows whose postcode is an integer in
# 100..9998; other rows are dropped.
# NOTE(review): the upper bound excludes 9999 - confirm that is intended.
ato2015_stats_norm['Postcode'] = taxstats2015_raw['Postcode']
has_valid_postcode = ato2015_stats_norm['Postcode'].isin(list(range(100, 9999)))
# .copy() so the column assignments below write to an independent frame rather
# than a slice (avoids SettingWithCopyWarning).
ato2015_stats_norm = ato2015_stats_norm.loc[has_valid_postcode].copy()
ato2015_stats_norm['Postcode'] = ato2015_stats_norm['Postcode'].astype(np.int64)
ato2015_stats_norm['Income year'] = 2015

In [None]:
# join datasets
# Stack the two years of each source. `pd.concat` is used because
# `DataFrame.append` has been removed from current pandas.
ato_data = pd.concat([ato2016_data, ato2015_data])
abs_data = pd.concat([abs2016_data, abs2015_data])
ato_stats = pd.concat([ato2016_stats_norm, ato2015_stats_norm])

# Outer-join everything on (income year, postcode); the tax help centre sheet is
# joined on postcode alone. Missing values are filled with 0 after the joins.
join_keys = ["Income year", "Postcode"]
df = (
    ato_data
    .merge(abs_data, on=join_keys, how="outer")
    .merge(ato_stats, on=join_keys, how="outer")
    .merge(seifa_data, on=join_keys, how="outer")
    .merge(txc_data, on="Postcode", how="outer")
    .fillna(0)
)
# TEMPORARY: limit to 2015/2016 data
# .copy() so the new columns below are written to an independent frame rather
# than a filtered slice (avoids SettingWithCopyWarning).
df = df[df['Income year'].isin([2016, 2015])].copy()

# Cut 'average' into as many equal-width buckets as there are distinct 'Count'
# values, and rank each 'Count' value by its position in sorted order, so both
# columns use the same 0..n-1 bucket ids.
count_levels = sorted(df['Count'].unique())
n_buckets = len(count_levels)
df['average_bucket'] = pd.cut(df['average'].values, bins=n_buckets, labels=list(range(n_buckets)))
count_bucket_dict = {count: rank for rank, count in enumerate(count_levels)}
df['count_bucket'] = df['Count'].map(count_bucket_dict)

# score is 1 where the two bucket ids agree and 0 otherwise.
df['score'] = (df['average_bucket'] == df['count_bucket']).astype(np.int64)

df.head()

In [None]:
# How many rows have matching buckets (score 1) versus not (score 0).
score_column = df["score"]
score_column.value_counts()

In [None]:
# specify features columns
# Features are every column after the first three, minus the target and the
# columns derived from it. Excluding these by name (instead of trimming the last
# three columns by position) keeps the target 'Count' out of the model inputs
# wherever it sits in the frame, and gives the same result if this cell is
# re-run after 'label' / 'is_train' have been added.
non_feature_columns = ['Count', 'average_bucket', 'count_bucket', 'score', 'label', 'is_train']
candidate_columns = df.columns[3:]
features = candidate_columns[~candidate_columns.isin(non_feature_columns)]

# feature correlations
# Pearson correlation of each feature with 'Count'; only correlations above 0.5
# are shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    count_corr = df[['Count'] + list(features)].corr(method='pearson')['Count']
    display(count_corr[count_corr > 0.5])

In [None]:
# create label column and train/test split
# The label is the raw 'Count' value. A binary alternative is kept for reference:
# df['label'] = (df['Count'] > 1).astype(np.int64)
df['label'] = df['Count']

# Seeded generator so the ~80/20 split is identical on every run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .80
train, test = df[df['is_train']], df[~df['is_train']]

# NOTE(review): only labels 0 and 1 are counted here; with the raw 'Count'
# label there may be further classes that these lines do not report.
print("Train Class Balance:", (train['label'] == 0).sum(), " / ", (train['label'] == 1).sum())
print("Test Class Balance:", (test['label'] == 0).sum(), " / ", (test['label'] == 1).sum())
train.head()

In [None]:
# build predictive model
# 1000-tree random forest with balanced class weights; oob_score=True makes the
# out-of-bag accuracy estimate available as rf.oob_score_ after fitting.
# random_state pins the bootstrap samples and feature draws so the fitted model
# and every score derived from it are reproducible between runs.
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=1000,
    oob_score=True,
    random_state=42,
)
rf.fit(train[features], train['label'])

In [None]:
# model accuracy (out of bag and test set)
test_predictions = rf.predict(test[features])
accuracy = accuracy_score(test['label'], test_predictions)
print(f'Out-of-bag score estimate: {rf.oob_score_:.3}')
print(f'Mean accuracy score: {accuracy:.3}')

# 5-fold cross-validation on the training rows, using the same estimator
# settings as rf. Reported as mean accuracy +/- two standard deviations.
# cross_val_score is imported with the other imports at the top of the notebook.
scores = cross_val_score(rf, train[features], train['label'], cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# confusion matrix for test set
test_predictions = rf.predict(test[features])

# Use one explicit, sorted class list for both the matrix and its axis labels.
# confusion_matrix orders classes sorted, so labelling the axes in order of
# appearance would attach the wrong class to rows and columns. Taking the union
# of true and predicted labels also keeps the shapes aligned when a class is
# predicted but absent from the test set.
class_labels = np.union1d(test['label'], test_predictions)
cm = pd.DataFrame(
    confusion_matrix(test['label'], test_predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

# Rows are true labels and columns are predicted labels; fmt='d' prints the
# cell counts as plain integers.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# top ten features
# Rank on the raw importances and round only for display. Ranking on values
# already rounded to two decimals ties many features together and makes the
# top ten partly arbitrary.
ranked_importances = sorted(
    zip(train[features].columns, rf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
# Same structure as before: (feature name, importance rounded to 2 decimals),
# most important first.
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_importances]
for feature, importance in feature_importances[:10]:
    print('Variable: {:50} Importance: {}'.format(feature, importance))