In [1]:
import pandas as pd
import numpy as np
import polars as pl
import fastexcel

In [2]:
# Load the "Table 2" sheet of the SA2 population workbook.
# NOTE(review): skip_rows=6 presumably steps over the title/notes rows that
# sit above the data in the sheet -- confirm against the workbook.
df = pl.read_excel(
    "data/Population estimates and components by SA2.xlsx",
    sheet_name="Table 2",
    read_options={"skip_rows": 6},
)

# Overwrite the parsed headers with explicit names, listed in sheet order.
# NOTE(review): "Net oveseas migration" is misspelled; it is left untouched
# here because other cells may already refer to the column by this name.
df.columns = [
    # Geography hierarchy (code/name pairs)
    "GCCSA code",
    "GCCSA name",
    "SA4 code",
    "SA4 name",
    "SA3 code",
    "SA3 name",
    "SA2 code",
    "SA2 name",
    # Population levels and change
    "2023 Pop",
    "2024 Pop",
    "2023-24 Change",
    "2023-24 Change %",
    # Components of change
    "Natural Increase",
    "Net internal migration",
    "Net oveseas migration",
    # Area measures
    "Area",
    "Population Density",
]

In [3]:
pl.Config.set_tbl_rows(30)

# Number of rows dropped from the bottom of the sheet before aggregating.
# NOTE(review): presumably these are total/footer rows rather than SA2
# records -- confirm against the workbook.
FOOTER_ROWS = 2


def _prefix_before_hyphen(column: str) -> pl.Expr:
    """Build an expression giving the text before the first "-" in `column`.

    Values containing a hyphen are cut at the first one and stripped of
    leading/trailing spaces; values without a hyphen (and nulls) pass through
    unchanged. The caller is responsible for aliasing the result.
    """
    name = pl.col(column)
    return (
        pl.when(name.str.contains("-", literal=True))
        .then(name.str.extract(r"^(.*?)-", 1).str.strip_chars(" "))
        .otherwise(name)
    )


def _rounded_mean_per_council(column: str) -> pl.Expr:
    """Mean of `column` across all rows sharing a Council, rounded to Int64."""
    return pl.col(column).mean().over("Council").round(0).cast(pl.Int64)


# One row per (Council, Suburb): SA2 rows are collapsed onto the name prefix
# of their SA2 ("Suburb") and SA3 ("Council"), then summed.
df_stg = (
    df
    .select("SA3 name", "SA2 name", "2024 Pop", "2023-24 Change", "Area")
    .head(df.height - FOOTER_ROWS)
    .with_columns(
        _prefix_before_hyphen("SA2 name").alias("Suburb"),
        _prefix_before_hyphen("SA3 name").alias("Council"),
    )
    .group_by(["Council", "Suburb"])
    .agg(
        pl.col("2024 Pop").sum(),
        pl.col("2023-24 Change").sum(),
        pl.col("Area").sum().alias("Suburb Area km^2"),
    )
    # Council-level averages, repeated on every suburb row of that council.
    .with_columns(
        _rounded_mean_per_council("2024 Pop").alias("avg_suburb_pop_in_Council"),
        _rounded_mean_per_council("Suburb Area km^2").alias("avg_suburb_area_in_Council"),
    )
    # group_by does not guarantee row order, so sort on both keys to make the
    # result reproducible between runs (sorting on Council alone leaves the
    # suburbs within a council in arbitrary order).
    .sort(["Council", "Suburb"])
    .select(
        "Council",
        "Suburb",
        "2024 Pop",
        "avg_suburb_pop_in_Council",
        "Suburb Area km^2",
        "avg_suburb_area_in_Council",
        "2023-24 Change",
    )
)

In [16]:
# Load the suburb lookup and relabel the suburb named exactly "Melbourne" as
# "Melbourne CBD".
# The pattern is anchored to the whole value: an unanchored replace would also
# rewrite names that merely contain "Melbourne" (e.g. "East Melbourne" ->
# "East Melbourne CBD"), corrupting their join keys.
# NOTE(review): the rename presumably aligns this lookup with the "Suburb"
# values derived from the SA2 names -- confirm.
df_map = pl.read_csv("data/mapping_fnl.csv").with_columns(
    pl.col("Suburb/Town Name").str.replace(r"^Melbourne$", "Melbourne CBD")
)

In [17]:
# Raise the display limit to 700 rows, then show the lookup ordered by
# suburb/town name for eyeballing.
pl.Config.set_tbl_rows(700)
df_map.sort("Suburb/Town Name")

Suburb/Town Name,Region,Local Government Area,Postcode
str,str,str,i64
"""Abbotsford""","""Metro Inner""","""Yarra""",3067.0
"""Aberfeldie""","""Other""",,
"""Aintree""","""Metro Outer West""","""Melton""",3336.0
"""Airport West""","""Metro Inner West""","""Moonee Valley""",3042.0
"""Albanvale""","""Metro Inner West""",,
"""Albert Park""","""Metro Inner""","""Port Phillip""",3206.0
"""Albion""","""Metro Inner West""","""Brimbank""",3020.0
"""Alexandra""","""Other""",,
"""Alfredton""","""Other""","""Ballarat""",3350.0
"""Alphington""","""Metro Inner North""","""Yarra""",3078.0


In [10]:
# Attach the lookup columns to the aggregated suburbs (left join keeps every
# df_stg row), then narrow to the single suburb being inspected.
# NOTE(review): this cell's execution count (10) is lower than that of the
# cells that build df_map (16, 17), so the saved output may predate the
# current df_map -- re-run top to bottom before trusting it.
df_stg2 = (
    df_stg
    .join(df_map, left_on="Suburb", right_on="Suburb/Town Name", how="left")
    .filter(pl.col("Suburb") == "Templestowe")
    .select("Council", "Suburb", "2024 Pop", "Region")
)

df_stg2

Council,Suburb,2024 Pop,Region
str,str,i64,str
"""Manningham""","""Templestowe""",17467,
