In [1]:
import logging
from lib.gnaf_db import GnafDb
from lib.service.io import IoService
from lib.tasks.fetch_static_files import initialise, get_session

from lib import notebook_constants as nc

# Emit INFO-and-above log records, prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# NOTE: from here on the name `io` refers to this service object, not the
# stdlib `io` module.
io = IoService.create(None)

# Top-level `async with` / `await` is not valid in a plain Python script; this
# cell relies on the IPython kernel's autoawait support.
# NOTE(review): judging by the module name (fetch_static_files) and the
# "Checking Target ..." log lines, initialise() appears to verify/download the
# static archives - confirm against lib.tasks.fetch_static_files.
async with get_session(io) as session:
    environment = await initialise(io, session)

# Database handle created with default arguments; later cells use it through
# gnaf.engine() and gnaf.connect().
# NOTE(review): wait_till_running() presumably blocks until the database
# accepts connections - confirm in lib.gnaf_db.
gnaf = GnafDb.create()
gnaf.wait_till_running()

# Second database handle, created from the configuration and database name
# held in notebook_constants.
gnaf_2 = GnafDb.create(nc.gnaf_dbconf_2, nc.gnaf_dbname_2)
gnaf_2.wait_till_running()

2024-09-24 12:02:37,648 - INFO - Checking Target "non_abs_shape.zip"
2024-09-24 12:02:37,649 - INFO - Checking Target "cities.zip"
2024-09-24 12:02:37,649 - INFO - Checking Target "g-naf_aug24_allstates_gda2020_psv_1016.zip"
2024-09-24 12:02:37,649 - INFO - Checking Target "nswvg_lv_01_Sep_2024.zip"
2024-09-24 12:02:37,650 - INFO - Checking Target "nswvg_wps_01_Jan_2024.zip"
2024-09-24 12:02:37,650 - INFO - Checking Target "nswvg_wps_08_Jan_2024.zip"
2024-09-24 12:02:37,650 - INFO - Checking Target "nswvg_wps_15_Jan_2024.zip"
2024-09-24 12:02:37,650 - INFO - Checking Target "nswvg_wps_22_Jan_2024.zip"
2024-09-24 12:02:37,651 - INFO - Checking Target "nswvg_wps_29_Jan_2024.zip"
2024-09-24 12:02:37,651 - INFO - Checking Target "nswvg_wps_05_Feb_2024.zip"
2024-09-24 12:02:37,651 - INFO - Checking Target "nswvg_wps_12_Feb_2024.zip"
2024-09-24 12:02:37,651 - INFO - Checking Target "nswvg_wps_19_Feb_2024.zip"
2024-09-24 12:02:37,652 - INFO - Checking Target "nswvg_wps_26_Feb_2024.zip"
2024-0

## Link GNAF with land values

Now for a messy part. There are a few obstacles to overcome before we can link the GNAF dataset with the addresses in the NSW Valuer General dataset. Here are a few:

- No direct identifier is shared by both datasets at the property level
- Inconsistent street naming conventions
- Some properties are missing fields you'd expect to use to link the data, such as `suburb` and `street_name`


In [2]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from scipy.optimize import linear_sum_assignment

engine = gnaf.engine()

# Rebuild the association tables from scratch so this cell can be re-run:
# drop whatever a previous run left behind, then execute the schema script.
with gnaf.connect() as conn:
    cursor = conn.cursor()
    try:
        cursor.execute("DROP TABLE IF EXISTS nsw_valuer_general.property_assoc")
        cursor.execute("DROP TABLE IF EXISTS nsw_valuer_general.street_assoc")
        cursor.execute("DROP TABLE IF EXISTS nsw_valuer_general.suburb_assoc")

        # NOTE(review): presumably this script recreates the three tables
        # dropped above - confirm against sql/nsw_lv_schema_4_links.sql.
        with open('sql/nsw_lv_schema_4_links.sql', 'r') as f:
            cursor.execute(f.read())
    finally:
        # Release the cursor even when one of the statements above fails.
        cursor.close()

# One row per distinct (suburb_id, suburb_name, district_code) referenced by a
# property, ordered by suburb name.
# NOTE(review): because this is a LEFT JOIN from property, properties without
# a matching suburb contribute an all-NULL row - confirm that is intended.
suburbs_df = pd.read_sql("""
SELECT DISTINCT su.suburb_id, su.suburb_name, su.district_code
  FROM nsw_valuer_general.property p
  LEFT JOIN nsw_valuer_general.suburb su ON p.suburb_id = su.suburb_id
 ORDER BY su.suburb_name
 -- WHERE su.suburb_name = 'ADAMINABY'
""", engine)

def check_col_is_unique(df, col, msg, throw=False):
    """Report duplicated values in ``df[col]``.

    Args:
        df: DataFrame to inspect.
        col: Name of the column expected to hold unique values.
        msg: ``%``-style format string; it is formatted with the array of
            distinct duplicated values and printed.
        throw: When true, raise instead of returning once duplicates have
            been displayed.

    Returns:
        ``None`` when the column is unique; otherwise a boolean Series
        marking every row whose value occurs more than once.

    Raises:
        Exception: If duplicates exist and ``throw`` is true.
    """
    column = df[col]

    if not column.is_unique:
        # keep=False flags every occurrence of a repeated value, not only
        # the second and later ones.
        dup_mask = column.duplicated(keep=False)
        dup_rows = df[dup_mask]

        display(dup_rows)
        print(msg % dup_rows[col].unique())

        if not throw:
            return dup_mask
        raise Exception("Failed unique check")

    return None

def calculate_cost_matrix(df_a, df_b):
    """Build a square street-name dissimilarity matrix for assignment matching.

    Entry ``[i, j]`` is ``100 - fuzz.ratio(a_i, b_j)``, where ``a_i`` is the
    ``street_name`` of row ``i`` in ``df_a`` and ``b_j`` is the
    ``gnaf_street_name`` of row ``j`` in ``df_b``: 0 for identical names,
    larger for less similar ones.

    The matrix is padded to ``max(len(df_a), len(df_b))`` on both axes, with
    padding cells set to 100, so the result is always square.

    Args:
        df_a: DataFrame with a ``street_name`` column.
        df_b: DataFrame with a ``gnaf_street_name`` column.

    Returns:
        A square integer ``numpy.ndarray`` of costs. If either frame is
        empty, every cell is padding.
    """
    # Pull the two columns out once instead of materialising a Series per
    # row with nested iterrows().
    names_a = df_a['street_name'].tolist()
    names_b = df_b['gnaf_street_name'].tolist()

    max_dim = max(len(names_a), len(names_b))
    padded_cost_matrix = np.full((max_dim, max_dim), 100)  # High cost for padding

    # With an empty side there is nothing to score; assigning an empty list
    # into a (0, m) slice would fail to broadcast, so skip the assignment.
    if names_a and names_b:
        padded_cost_matrix[:len(names_a), :len(names_b)] = [
            [100 - fuzz.ratio(name_a, name_b) for name_b in names_b]
            for name_a in names_a
        ]

    return padded_cost_matrix
    
# A street is treated as unmatched when the cost of its assigned GNAF street
# (100 - fuzz.ratio, so 0 means identical names) exceeds this threshold.
MAX_STREET_NAME_COST = 20

# Slices of valuer-general streets that could not be linked to a GNAF street.
unmatched_rows = []

for index, suburb in suburbs_df.iterrows():
    print(suburb['suburb_name'])

    vg_streets_df = pd.read_sql("""
        SELECT *
          FROM nsw_valuer_general.street
         WHERE suburb_id = %(suburb_id)s
    """, engine, params={
        "suburb_id": suburb['suburb_id'],
    })

    if len(vg_streets_df) == 0:
        raise Exception(
            f"no streets found for suburb {suburb['suburb_name']!r} "
            f"(suburb_id={suburb['suburb_id']})"
        )

    postcodes = tuple(vg_streets_df['postcode'].unique())

    # Every distinct GNAF street in the same-named NSW locality, limited to
    # the postcodes the valuer-general streets use. gnaf_street_name is the
    # street name followed by the street type name and suffix name, when
    # present.
    gnaf_s_df = pd.read_sql("""
        SELECT DISTINCT 
            sl.street_locality_pid as gnaf_street_locality_pid,
            sl.street_name,
            sl.street_type_code,
            sl.street_name 
              || COALESCE(' ' || sta.name, '')
              || COALESCE(' ' || ssa.name, '')
              as gnaf_street_name,
            l.locality_name as suburb_name,
            ad.postcode

          FROM gnaf.ADDRESS_DETAIL ad
          LEFT JOIN gnaf.locality l ON ad.locality_pid = l.locality_pid
          LEFT JOIN gnaf.STREET_LOCALITY sl ON sl.street_locality_pid = ad.street_locality_pid
          LEFT JOIN gnaf.STATE s ON l.state_pid = s.state_pid
          LEFT JOIN gnaf.STREET_TYPE_AUT sta ON sta.code = sl.street_type_code
          LEFT JOIN gnaf.STREET_SUFFIX_AUT ssa ON ssa.code = sl.street_suffix_code
          
         WHERE l.locality_name = %(suburb_name)s
           AND ad.postcode IN %(postcodes)s
           AND s.state_abbreviation = 'NSW'
    """, engine, params={
        "suburb_name": suburb['suburb_name'],
        "postcodes": postcodes,
    })

    # Match streets one postcode at a time so names are only compared within
    # the same postcode.
    for postcode in postcodes:
        v_slice = vg_streets_df[vg_streets_df['postcode'] == postcode].reset_index(drop=True)
        g_slice = gnaf_s_df[gnaf_s_df['postcode'] == postcode].reset_index(drop=True)

        if len(g_slice) == 0:
            # GNAF has no streets for this postcode: nothing to match against.
            unmatched_rows.append(v_slice)
            continue

        # The later merge on gnaf_street_name assumes one GNAF row per name.
        check_col_is_unique(g_slice,
                            'gnaf_street_name',
                            f"{suburb['suburb_name']} non unique streets %s",
                            throw=True)

        # One-to-one assignment minimising the total name dissimilarity.
        cost_matrix = calculate_cost_matrix(v_slice, g_slice)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # The cost matrix is padded to be square, so a street may be assigned
        # a padding column. The bound must be the size of this postcode's
        # GNAF slice, not the whole suburb's GNAF frame; otherwise a padding
        # column index can slip through and iloc raises IndexError.
        assigned_cols = col_ind[:len(v_slice)]
        v_slice['gnaf_street_name'] = [
            g_slice.iloc[col]['gnaf_street_name'] if col < len(g_slice) else None
            for col in assigned_cols
        ]
        v_slice['gnaf_street_name_cost'] = [
            cost_matrix[idx, col_idx]
            for idx, col_idx in enumerate(assigned_cols)
        ]

        # Drop streets assigned to padding or to a poor name match.
        v_slice['drop'] = False
        v_slice.loc[v_slice['gnaf_street_name'].isna(), 'drop'] = True
        v_slice.loc[v_slice['gnaf_street_name_cost'] > MAX_STREET_NAME_COST, 'drop'] = True

        if not v_slice[v_slice['drop']].empty:
            unmatched = v_slice[v_slice['drop']]
            display(unmatched[['street_name', 'gnaf_street_name', 'gnaf_street_name_cost']])
            unmatched_rows.append(unmatched)

        display(v_slice[~v_slice['drop']])

        # Attach the GNAF street_locality_pid to each accepted match.
        v_slice = v_slice[~v_slice['drop']]\
            .merge(g_slice, how='left', on=['gnaf_street_name', 'postcode'])

        street_assoc_df = v_slice[['street_id', 'gnaf_street_locality_pid']]
        street_assoc_df.to_sql('street_assoc',
                               engine,
                               schema='nsw_valuer_general',
                               if_exists='append',
                               index=False)


AARONS PASS


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,77,AARONS PASS RD,620,1,2850,AARONS PASS RD,0,False
1,13100,BOMBANDI RD,620,1,2850,BOMBANDI RD,0,False
2,22126,CASTLEREAGH HWY,620,1,2850,CASTLEREAGH HWY,0,False
3,30292,CRUDINE RD,620,1,2850,CRUDINE RD,0,False
4,91032,PERRAMS RD,620,1,2850,PERRAMS RD,0,False
5,110157,SUTTERS LANE,620,1,2850,SUTTERS LANE,0,False
6,121404,WATTLE BRAE LANE,620,1,2850,WATTLE BRAE LANE,0,False


ABBOTSBURY


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,6538,BALSON CL,220,2,2176,BALSON CL,0,False
1,6610,BANCROFT RD,220,2,2176,BANCROFT RD,0,False
2,9165,BEGOVICH CRES,220,2,2176,BEGOVICH CR,8,False
3,9457,BELLETTE CL,220,2,2176,BELLETTE CL,0,False
4,14837,BRACK CL,220,2,2176,BRACK CL,0,False
5,26059,COCHRAN PL,220,2,2176,COCHRAN PL,0,False
6,26851,COMIN PL,220,2,2176,COMIN PL,0,False
7,27069,CONDOR CL,220,2,2176,CONDOR CL,0,False
8,29156,COWPASTURE RD,220,2,2176,COWPASTURE RD,0,False
9,31306,DALBERTIS ST,220,2,2176,DALBERTIS ST,0,False


ABBOTSFORD


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,117,ABBOTSFORD COVE DR,139,3,2046,ABBOTSFORD COVE DR,0,False
1,119,ABBOTSFORD PDE,139,3,2046,ABBOTSFORD PDE,0,False
2,2369,ALTONA ST,139,3,2046,ALTONA ST,0,False
3,8257,BATTERSEA ST,139,3,2046,BATTERSEA ST,0,False
4,10967,BICKLEIGH ST,139,3,2046,BICKLEIGH ST,0,False
5,12040,BLACKWALL POINT RD,139,3,2046,BLACKWALL POINT RD,0,False
6,20183,CAMPBELL ST,139,3,2046,CAMPBELL ST,0,False
7,23617,CHARLTON ST,139,3,2046,CHARLTON ST,0,False
8,23686,CHATHAM PL,139,3,2046,CHATHAM PL,0,False
9,23741,CHECKLEY ST,139,3,2046,CHECKLEY ST,0,False


ABERCROMBIE


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,211,ABERCROMBIE DR,608,4,2795,ABERCROMBIE DR,0,False
1,3575,APPLEBLOSSOM GR,608,4,2795,APPLEBLOSSOM GR,0,False
2,8378,BAYLISS ST,608,4,2795,BAYLISS ST,0,False
3,28036,COPEMAN CT,608,4,2795,COPEMAN CT,0,False
4,28923,COUNTRY WAY,608,4,2795,COUNTRY WAY,0,False
5,29172,COWPASTURES GR,608,4,2795,COWPASTURES GR,0,False
6,32068,DARWIN DR,608,4,2795,DARWIN DR,0,False
7,32179,DAVIDSON ST,608,4,2795,DAVIDSON ST,0,False
8,35735,DUNOON PL,608,4,2795,DUNOON PL,0,False
9,37164,EGLINTON RD,608,4,2795,EGLINTON RD,0,False


ABERCROMBIE RIVER


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,6253,BALD RIDGE RD,608,5,2795,BALD RIDGE RD,0,False
1,22523,CAVES RD,608,5,2795,CAVES RD,0,False
2,30782,CURRAGH RD,608,5,2795,CURRAGH RD,0,False
3,40104,FALLS RD,608,5,2795,FALLS RD,0,False
4,47657,GOULBURN RD,608,5,2795,GOULBURN RD,0,False


ABERDARE


Unnamed: 0,street_name,gnaf_street_name,gnaf_street_name_cost
8,DUFFIE DR,QUARRYBYLONG ST,83


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
0,232,ABERDARE RD,1,6,2325,ABERDARE RD,0,False
1,15142,BRANDIS ST,1,6,2325,BRANDIS ST,0,False
2,19569,CALEDONIAN ST,1,6,2325,CALEDONIAN ST,0,False
3,23021,CESSNOCK ST,1,6,2325,CESSNOCK ST,0,False
4,26497,COLLIERY ST,1,6,2325,COLLIERY ST,0,False
5,26832,COMERFORD CL,1,6,2325,COMERFORD CL,0,False
6,27104,CONGEWAI ST,1,6,2325,CONGEWAI ST,0,False
7,30937,CURRY ST,1,6,2325,CURRY ST,0,False
9,49105,GRETA ST,1,6,2325,GRETA ST,0,False
10,61532,KEARSLEY ST,1,6,2325,KEARSLEY ST,0,False


ABERDEEN


Unnamed: 0,street_name,gnaf_street_name,gnaf_street_name_cost
0,HALLS LANE,HALL ST,41


Unnamed: 0,street_id,street_name,district_code,suburb_id,postcode,gnaf_street_name,gnaf_street_name_cost,drop
1,83082,NEW ENGLAND HWY,7,7,2336,NEW ENGLAND HWY,0,False
2,122185,WELLS GULLY RD,7,7,2336,WELLS GULLY RD,0,False


ABERDEEN


KeyboardInterrupt: 