In [None]:
import pandas as pd
import numpy as np

# Group-based analysis
The ability to apply both built-in and your own functions to data tables is particularly powerful when we group data into categories. Geographical data are a good place to apply this because they are often organised hierarchically (meshblocks into SA1s into SA2s and so on). It is worth keeping in mind that if you have labels for the layers in a geographical hierarchy, it is almost certainly quicker to aggregate data using those than by spatial joins.

To explore this we go back to the source SA1 data for all of New Zealand, and apply some of the steps from the previous notebook on data cleaning.

In [14]:
# Lookup from SA1 code to urban/rural (UR) area name: one row per distinct
# (code, name) pair, indexed by the SA1 code
urban_areas = (
    pd.read_csv("geographic-areas-table-2023.csv")
    .loc[:, ["SA12023_code", "UR2023_name"]]
    .drop_duplicates()
    .set_index("SA12023_code")
)

# Census table (all 500+ columns), indexed by SA1 code. The values -999 and
# -997 are replaced with pd.NA, only rows whose "Landwater name" is "Mainland"
# are kept, and the bookkeeping columns are then dropped.
sa1 = (
    pd.read_csv("2023_Census_totals_by_topic_for_individuals_by_SA1.csv")
    .rename(columns={"Statistical area 1 (SA1) 2023 code": "sa1_code"})
    .set_index("sa1_code")
    .replace([-999, -997], pd.NA)
    .loc[lambda census: census["Landwater name"] == "Mainland"]
    .drop(columns=["OBJECTID", "Landwater code", "Landwater name"])
)

# Attach the UR name to every SA1; the inner join keeps only codes found in
# both tables, and the index is relabelled afterwards
sa1 = urban_areas.join(sa1, how="inner")
sa1.index.name = "sa1_code"
sa1

Unnamed: 0_level_0,UR2023_name,"Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2018, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2023, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census night population count, Year: 2013, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2018, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2023, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 0-4 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 5-9 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 10-14 years)",...,"Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Some difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (A lot of difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Cannot do at all)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Not elsewhere included)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Total)","Subject pop: Census usually resident population count aged 5 years 
and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Total stated)",Area square kilometres,Land area square kilometres,Shape__Area,Shape__Length
sa1_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7029921,Other rural Far North District,168,183,177,174,183,177,15,15,18,...,6,0,3,48,168,120,218.041023,218.041023,2.180409e+08,128776.523150
7000004,Other rural Far North District,102,138,156,105,129,165,9,6,3,...,6,3,0,42,147,108,168.741273,168.741273,1.687406e+08,74357.812918
7000019,Other rural Far North District,228,243,285,225,255,291,18,18,18,...,9,3,0,72,264,192,124.516389,124.516389,1.245162e+08,67877.430558
7000023,Kaimaumau,135,153,204,129,144,201,15,9,9,...,12,3,0,63,189,126,37.086499,37.086499,3.708609e+07,32229.272649
7000021,Other rural Far North District,186,204,207,186,198,207,15,18,27,...,12,0,0,63,201,138,17.260724,17.260724,1.726090e+07,18504.382147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7032140,Auckland,9,6,12,9,9,9,,,,...,0,0,0,3,12,9,0.080066,0.080066,8.012290e+04,1477.956816
7034861,Wellington,0,3,3,0,3,3,,,,...,,,,,3,,0.020728,0.020728,2.071054e+04,724.589116
7031627,Auckland,0,0,0,0,0,0,,,,...,,,,,0,,0.149061,0.149061,1.492076e+05,3861.152148
7031179,Auckland,0,0,0,0,0,0,,,,...,,,,,0,,0.021934,0.021934,2.205937e+04,838.101633


In [15]:
# Number of SA1 rows that fall in each urban/rural area
sa1["UR2023_name"].value_counts()

UR2023_name
Auckland          8690
Christchurch      2523
Wellington        1404
Hamilton          1096
Tauranga           959
                  ... 
Orua Bay             1
Aramoana             1
St Arnaud            1
Birdlings Flat       1
Pounawea             1
Name: count, Length: 652, dtype: int64

Now if we use `groupby`, we can apply built-in or even our own functions to groups of data.

In [16]:
# Group the SA1 rows by their urban/rural area name; the resulting groupby
# object is reused by the cells below.
grouped_df = sa1.groupby("UR2023_name")
# Column sums within each group (one output row per UR2023_name)
grouped_df.sum()

Unnamed: 0_level_0,"Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2018, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2023, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census night population count, Year: 2013, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2018, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2023, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 0-4 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 5-9 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 10-14 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 15-19 years)",...,"Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Some difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (A lot of difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Cannot do at all)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Not elsewhere included)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, 
Measure: Count, Var1: Difficulty washing (Total)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Total stated)",Area square kilometres,Land area square kilometres,Shape__Area,Shape__Length
UR2023_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ahaura,96,96,102,102,117,108,3,3,6,6,...,3,0,0,21,93,72,0.386275,0.386275,3.863386e+05,2939.846528
Ahipara,1029,1230,1272,1098,1257,1308,69,81,87,54,...,36,3,0,381,1185,810,4.528173,4.528173,4.527639e+06,28027.275865
Akaroa,642,756,633,1125,3372,807,21,12,30,24,...,18,6,0,105,621,504,2.029935,2.029935,2.029934e+06,19274.399059
Alexandra,4806,5466,5598,4863,5598,5844,228,279,315,213,...,252,39,30,549,5322,4755,9.760936,9.760936,9.760529e+06,85424.686372
Algies Bay,651,762,723,663,768,750,36,21,24,21,...,36,18,21,105,708,606,0.672530,0.672530,6.724908e+05,11123.566031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ōtautau,702,753,777,690,735,774,48,54,51,42,...,24,9,3,102,723,615,2.600478,2.600478,2.600463e+06,20653.759426
Ōtorohanga,2622,3024,3174,2679,3075,3228,204,201,165,168,...,132,27,15,708,2931,2226,5.074022,5.074022,5.074078e+06,48453.544360
Ōtākou-Harington Point,162,192,195,174,195,198,6,6,12,6,...,15,0,0,36,186,153,5.651539,5.651539,5.651761e+06,14437.719824
Ōtāne,540,666,759,537,654,750,30,39,30,27,...,33,6,0,132,687,549,0.938055,0.938055,9.381905e+05,9928.948581


To apply a function to grouped data you use `agg()` in place of `apply()`.

In [17]:
# Apply the user-defined function to every column within each group.
# NOTE(review): `unevenness` is not defined anywhere in the visible part of
# this notebook - presumably it comes from an earlier notebook/session.
# Confirm it is defined or imported before this cell so that
# Restart Kernel -> Run All succeeds.
grouped_df.agg(unevenness)

Unnamed: 0_level_0,"Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2018, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census usually resident population count, Year: 2023, Measure: Count, Var1: Census usually resident population count (Total)","Subject pop: Census night population count, Year: 2013, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2018, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census night population count, Year: 2023, Measure: Count, Var1: Census night population count (Total)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 0-4 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 5-9 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 10-14 years)","Subject pop: Census usually resident population count, Year: 2013, Measure: Count, Var1: Age (5-year groups - 15-19 years)",...,"Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Some difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (A lot of difficulty)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Cannot do at all)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Not elsewhere included)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, 
Measure: Count, Var1: Difficulty washing (Total)","Subject pop: Census usually resident population count aged 5 years and over, Year: 2023, Measure: Count, Var1: Difficulty washing (Total stated)",Area square kilometres,Land area square kilometres,Shape__Area,Shape__Length
UR2023_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ahaura,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.0,...,1.0,0,0,1.0,1.000000,1.0,1.000000,1.000000,1.000000,1.000000
Ahipara,0.144642,0.130660,0.134612,0.145988,0.131857,0.137267,,,,,...,0.166667,1.0,0,0.137826,0.134831,0.138272,0.257710,0.257710,0.257685,0.158381
Akaroa,0.170670,0.191515,0.174906,0.187228,0.598433,0.181797,0.22449,0.375,0.18,0.1875,...,0.277778,1.0,0,0.172245,0.174450,0.178005,0.187719,0.187719,0.187731,0.173328
Alexandra,0.028883,0.028052,0.028360,0.028765,0.028181,0.028393,,,,,...,,,,,0.028355,,0.088270,0.088270,0.088276,0.042168
Algies Bay,0.175009,0.174096,0.179456,0.174505,0.173370,0.176480,0.208333,0.22449,0.25,0.22449,...,0.25,0.722222,1.0,0.2,0.180157,0.180816,0.209293,0.209293,0.209382,0.182996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ōtautau,0.153992,0.154966,0.156930,0.155085,0.154219,0.157352,0.164062,0.148148,0.204152,0.193878,...,0.1875,0.333333,1.0,0.190311,0.156936,0.154741,0.250942,0.250942,0.250851,0.193042
Ōtorohanga,0.053053,0.053743,0.055188,0.053700,0.054188,0.055280,0.062284,0.059924,0.061818,0.063776,...,0.073347,0.358025,0.28,0.056844,0.055274,0.056179,0.088551,0.088551,0.088539,0.061855
Ōtākou-Harington Point,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.0,...,1.0,0,0,1.0,1.000000,1.0,1.000000,1.000000,1.000000,1.000000
Ōtāne,0.215432,0.211671,0.221484,0.213695,0.211977,0.221504,0.34,0.242604,0.22,0.234568,...,0.272727,0.5,0,0.242769,0.222250,0.231449,0.206829,0.206829,0.206816,0.200866


We can even apply functions to the results of applying other functions:

In [18]:
# First sum every column per group, then call unevenness once per row of that
# summed table (axis = "columns" hands each row to the function), giving one
# value per UR2023_name.
# NOTE(review): relies on `unevenness` being defined outside the visible cells.
display(grouped_df.sum().apply(unevenness, axis = "columns"))

UR2023_name
Ahaura                    0.909342
Ahipara                   0.910732
Akaroa                    0.879462
Alexandra                 0.828646
Algies Bay                0.700872
                            ...   
Ōtautau                   0.899677
Ōtorohanga                0.821761
Ōtākou-Harington Point    0.984261
Ōtāne                     0.790839
Ōwhango                   0.899430
Length: 652, dtype: float64