# Address Parsing Synthetic Data Generation Code

### Overview

The following script generates the synthetic data used to fine-tune the deepparse library.

### Imports

In [1]:
# Install the following packages
# !pip install Faker==23.1.0 nlpaug==1.1.11 numpy==1.26.4 pandas==2.0.3 tqdm==4.66.1

In [2]:
import pandas as pd
import numpy as np
import random as rand
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import string
import copy
from tqdm import tqdm
from copy import deepcopy
from faker import Faker
from typing import List 
from numpy.random import choice
from numpy.testing import assert_almost_equal
from string import ascii_lowercase, digits

  from .autonotebook import tqdm as notebook_tqdm


### Dictionaries

The following dictionaries contain abbreviations and equivalent spellings. Each `to_non_stand_X` dictionary maps a standard term to its non-standard equivalents, and the corresponding `to_stand_X` dictionary is the inverse map. TODO: Add more elements to the `to_non_stand` dictionaries.

In [3]:
# English street-direction terms (standard abbreviations followed by their spelling
# variants). A random entry is appended to some generated English street names.
# Note that "ne" occurs twice in the north-east group.
street_dir_en = (
    # east
    ["e", "east", "e."]
    # north
    + ["n", "north", "n.", "nrt", "nrth", "northern"]
    # north-east
    + [
        "ne",
        "northeast", "north east", "northe", "north e",
        "neast", "n east", "ne", "n e",
        "nrteast", "nrt east", "nrte", "nrt e",
        "nrtheast", "nrth east", "nrthe", "nrth e",
        "ntheast", "nth east", "nthe", "nth e",
        "norheast", "norh east", "norhe", "norh e",
        "norteast", "nort east", "norte", "nort e",
    ]
    # north-west
    + [
        "nw",
        "northwest", "north west", "northw", "north w", "northwst", "north wst",
        "nwest", "n west", "n w", "nwst", "n wst",
        "nrtwest", "nrt west", "nrtw", "nrt w", "nrtwst", "nrt wst",
        "nrthwest", "nrth west", "nrthw", "nrth w", "nrthwst", "nrth wst",
        "nthwest", "nth west", "nthw", "nth w", "nthwst", "nth wst",
        "norhwest", "norh west", "norhw", "norh w", "norhwst", "norh wst",
        "nortwest", "nort west", "nortw", "nort w", "nortwst", "nort wst",
    ]
    # south
    + ["s", "south", "so", "sth"]
    # south-east
    + [
        "se",
        "southeast", "south east", "southe", "south e",
        "seast", "s east", "s e",
        "soeast", "so east", "so e",
        "stheast", "sth east", "sthe", "sth e",
    ]
    # south-west
    + [
        "sw",
        "southwest", "south west", "southw", "south w", "southwst", "south wst",
        "swest", "s west", "s w", "swst", "s wst",
        "sowest", "so west", "so w", "sowst", "so wst",
        "sthwest", "sth west", "sthw", "sth w", "sthwst", "sth wst",
    ]
    # west
    + ["w", "west", "wst"]
)

# French street-direction terms. A random entry is appended to some generated
# Quebec street names.
street_dir_fr = [
    "e", "est", "e.",
    "n", "nord", "n.",
    "ne",
        "nord-est",
    "nw",
        # The comma after this entry was missing, which silently concatenated it
        # with the following "s" into the single bogus term "nord ouests".
        "nord ouest",
    "s", "sud",
    "se",
        "sud-est",
    "sw",
        "sud ouest",
    "w", "ouest",
]

# Province/territory codes iterated when loading the per-province address files.
# Note: "nu", "yt" and "nl" have entries in to_non_stand_prov below but are not listed here.
province_lst = ["on", "ab", "bc", "ns", "nb", "pe", "qc", "sk", "mb", "nt"]

# Maps each standard two-letter province/territory code to non-standard ways of
# writing it (full names and dotted abbreviations).
to_non_stand_prov = {
    "ab": ["alberta"],
    "bc": ["british columbia", "b.c.", "b. c."],
    "mb": ["manitoba"],
    "nb": ["new brunswick", "n.b."],
    "ns": ["nova scotia", "n.s.", "n. s."],
    "nt": ["northwest territories", "n.t.", "nwt"],
    "nu": ["nunavut"],
    "on": ["ontario"],
    "pe": ["prince edward island", "pei", "p. e. i.", "p.e.i"],
    "qc": ["quebec"],
    "sk": ["saskatchewan"],
    "yt": ["yukon", "the yukon", "yukon territory"],
    "nl": ["newfoundland and labrador", "n. l.", "n.l."],
}

# Maps each standard street-direction abbreviation to its non-standard spellings.
# The "ne" list contains "ne" itself.
to_non_stand_street_dir = {
    "e": ["east", "e."],
    "n": ["north", "n.", "nrt", "nrth", "northern"],
    "ne": [
        "northeast", "north east", "northe", "north e",
        "neast", "n east", "ne", "n e",
        "nrteast", "nrt east", "nrte", "nrt e",
        "nrtheast", "nrth east", "nrthe", "nrth e",
        "ntheast", "nth east", "nthe", "nth e",
        "norheast", "norh east", "norhe", "norh e",
        "norteast", "nort east", "norte", "nort e",
    ],
    "nw": [
        "northwest", "north west", "northw", "north w", "northwst", "north wst",
        "nwest", "n west", "n w", "nwst", "n wst",
        "nrtwest", "nrt west", "nrtw", "nrt w", "nrtwst", "nrt wst",
        "nrthwest", "nrth west", "nrthw", "nrth w", "nrthwst", "nrth wst",
        "nthwest", "nth west", "nthw", "nth w", "nthwst", "nth wst",
        "norhwest", "norh west", "norhw", "norh w", "norhwst", "norh wst",
        "nortwest", "nort west", "nortw", "nort w", "nortwst", "nort wst",
    ],
    "s": ["south", "so", "sth"],
    "se": [
        "southeast", "south east", "southe", "south e",
        "seast", "s east", "s e",
        "soeast", "so east", "so e",
        "stheast", "sth east", "sthe", "sth e",
    ],
    "sw": [
        "southwest", "south west", "southw", "south w", "southwst", "south wst",
        "swest", "s west", "s w", "swst", "s wst",
        "sowest", "so west", "so w", "sowst", "so wst",
        "sthwest", "sth west", "sthw", "sth w", "sthwst", "sth wst",
    ],
    "w": ["west", "wst"],
}

# Maps each standard "numbered term" keyword (unit, PO box, penthouse, rural road)
# to non-standard ways of writing it. Variants are drawn with a uniform random
# choice, so an entry that is repeated in a list (e.g. "box") is drawn proportionally more often.
to_non_stand_num_term = {
    "unit": [
        "suite",
        "apt",
        "apartment",
        "ste",
    ],  # unit is a special case: something like "123-43 king" should evaluate to "unit 123 43 king"
    "po box": [
        "post office box",
        "p.o. box",
        "p.o box",
        "pobox",
        "postbox",
        "post box",
        "post box",
        "p o box",
        "p office box",
        "pb",
        "pob",
        "p o b",
        "post mail box",
        "post mail box",
        "pmb",
        "box",
        "box",
        "box",
    ],
    # Note: "pHouse" is the only mixed-case entry, and "ph" maps to itself.
    "ph" : [
        "penthouse",
        "pent house",
        "penth",
        "penthse",
        "pHouse",
        "pent.",
        "ph",
    ],
    "rr" : ["rr#", "rural road", "r road", "country road", "side road"]
}

# Maps each standard street-type abbreviation (the table also holds building and
# place terms such as "bldg", "fl", "hsp") to its non-standard spellings.
# The keys of this table form the `street_type` list, and the table is inverted
# into `to_stand_street_type` further down in this cell.
to_non_stand_street_type = {
    "alley": ["aly", "ally"],
    "alwy": ["alleyway", "allyway", "allwy"],
    "anx": ["annex", "anex", "annx"],
    "arc": ["arcade"],
    "ave": ["avenue", "av", "aven", "avenu", "avn", "avnue"],
    "bayou": ["bayoo"],
    "bch": ["beach"],
    "bldg": ["building", "bdg", "bld", "blg"],
    "blf": ["bluf", "bluff"],
    "blfs": ["bluffs"],
    "blk": ["block"],
    "blvd": ["boulevard", "boul", "boulv"],
    "bnd": ["bend"],
    "bnglw": ["bungalow", "bongalow", "bunglow", "bungalo", "bngw"],
    "bot": ["bottom", "bottm", "btm"],
    "br": ["branch", "brnch"],
    "brg": ["bridge", "brdge"],
    "brgs": ["burgs"],
    "brk": ["brook", "break"],
    "brks": ["brooks"],
    # "basement" is listed twice in the next entry
    "bsmt": ["basement", "bsm", "bsmnt", "basement", "bsment"],
    "byps": ["bypass", "bypa", "bps", "byp"],
    "cabin": ["cabn"],
    "circ": ["circle", "cir", "circel", "circles"],
    "clf": ["cliff"],
    "clfs": ["cliffs"],
    "cmn": ["common"],
    "cmns": ["commons"],
    "cmp" : ["camp"],
    "cnyn" : ["canyon", "cyn"],
    "cor" : ["corner"],
    "cors" : ["corners"],
    "cpe": ["cape"],
    "cresc": ["crescent", "crs", "crecent"],
    "crk": ["creek"],
    "crossing": ["xing", "crssing"],
    "crossroad" : ["cross road", "xroad"],
    "cswy": ["causeway", "cause way"],
    "ct" : ["court"],
    "ctyd" : ["court yard", "courtyard"],
    "ctr": ["center", "centre", "cent", "centr", "cnter", "cntr"],
    "curv" : ["curve"],
    "cv" : ["cove"],
    "cvs" : ["coves"],
    "dl" : ["dale"],
    "dm" : ["dam"],
    "div" : ["divide"],
    "dr": ["drive", "drv", "driv"],
    "drs" : ["drives"],
    "dupl": ["duplex", "dup"],
    "dvwy": ["driveway", "dway", "drive way"],
    "est": ["estate"],
    "ests" : ["estates"],
    "exp": ["express", "expressway", "express way", "expwy", "expw", "expr"],
    "fcty": ["fty", "fy"],
    "fl": [
        "floor",
        "flr",
        "f",
        "level",
        "lev",
        "levl",
        "lvel",
        "lvl",
        "platform",
        "pf",
        "storey",
    ],
    "fld": ["field"],
    "flds" : ["fields"],
    "fls" : ["falls"],
    "flt": ["flat"],
    "frd": ["ford"],
    "frds" : ["fords"],
    "frg" : ["forge", "forg"],
    "frgs" : ["forges"],
    "frk": ["fork"],
    "frst": ["forest"],
    "fry": ["ferry", "frry"],
    "fwy": ["freeway", "free way", "freewy"],
    # the "gdfl" list contains "gdfl" itself
    "gdfl": [
        "ground floor",
        "gdfl",
        "gd fl",
        "gd/fl",
        "gd / fl",
        "gf",
        "ground level",
        "gd lvl",
        "g lvl",
        "g level",
        "gd level",
        "ground lvl",
        "lobby",
        "main floor",
    ],
    "gpo": ["general post office box", "gpo box"],
    "grdn": ["garden", "gardn"],
    "grn" : ["green"],
    "grns" : ["greens"],
    "grv": ["grove", "grov"],
    "grvs": ["groves"],
    "gym": ["gymnasium"],
    "hbr" : ["harbour", "hrbor"],
    "hbrs" : ["harbours"],
    "hl": ["hill"],
    "hls" : ["hills"],
    "hllw": ["hollow", "holw"],
    "hotl": ["hotel", "hot", "htel"],
    "hs": ["high school", "h s"],
    "hsp": ["hospital", "hos", "hosp", "hospice", "hosptl", "hsptl"],
    "hts": ["heights"],
    "hvn": ["haven"],
    "hwy": ["highway", "highwy"],
    "jct": ["junction", "jction", "juncton"],
    "jcts": ["junctions"],
    "ky" : ["key"],
    "kys" : ["keys"],
    "knl" : ["knoll"],
    "knls" : ["knolls"],
    "lck": ["lock"],
    "lcks": ["locks"],
    "ldg": ["lodge", "lodg", "ldge"],
    "lgt": ["light"],
    "lk" : ["lake"],
    "lks" : ["lakes"],
    "ln": ["lane"],
    "lp": ["loop"],
    "lwr": ["lower", "lowr"],
    "mdw": ["meadow"],
    "mdws" : ["meadows"],
    "ml" : ["mill"],
    "mls" : ["mills"],
    "mnr": ["manor"],
    "mnrs" : ["manors"],
    "msn" : ["mission"],
    "mt": ["mount"],
    "mtn": ["mountain"],
    "mtns": ["mountains"],
    "mtwy": ["motorway", "motor way"],
    "nck" : ["neck"],
    "orch": ["orchard", "orchrd"],
    "ovl": ["oval"],
    "ovlk" : ["overlook", "over look"],
    "opas" :  ["overpass","over pass"],
    "pkwy": ["parkway", "parkwy", "pkway", "prkwy", "prkway", "park way"],
    "pl": ["place", "pla", "plc", "plac"],
    "plz": ["plaza", "plza"],
    "prk": ["park"],
    "prt": ["port"],
    "psge" : ["passage"],
    "pt": ["point"],
    "pts": ["points"],
    "pth": ["path"],
    "pthwy": ["phwy","pway","pthway","pathway","ptway","ptwy", "pathwy"],
    "rd": ["road"],
    "rds" : ["roads"],
    "rdg": ["ridge", "rdge"],
    "rnch": ["ranch"],
    "rte": ["route"],
    "rvr": ["river", "riv", "rivr"],
    "skwy": ["skyway", "sky way"],
    "smt": ["summit"],
    "spg": ["spring", "sprng", "spng"],
    "spgs": ["springs"],
    "sq": ["square", "sqr"],
    "st" : ["str","stre","stree","strt", "street", "streets"],
    "stn": ["station", "statn", "sta"],
    "tce": ["terrace", "ter", "tr", "terr", "terace", "terrac", "terrasse", "tsse"],
    "tpke": ["turnpike", "trnpk", "turnpk"],
    "trfy": ["trafficway"],
    "trl": ["trail"],
    "trwy": ["throughway", "through way"],
    "twr": ["tower", "towers"],
    "u": ["university", "uni", "univ", "univers", "unvrsty"],
    "un": ["union"],
    "upas": ["underpass"],
    "upr": ["upper", "uppr", "up"],
    "vdct": ["viaduct", "viadct"],
    "vis": ["vista", "vst", "vsta", "vist"],
    "vl": ["ville"],
    "vlg": ["village", "vge", "vllg"],
    "vly": ["valley", "vallys", "vlly"],
    "vlys": ["valleys"],
    "wl" : ["wells"],
    "wy": ["way"],
}

# Maps standard street-name words ("st" and the ordinals 1st..9th) to their
# non-standard spellings.
to_non_std = {"st": ["saint", "st."]}
to_non_std.update({
    ordinal: [spelled_out]
    for ordinal, spelled_out in zip(
        ["1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th"],
        ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth"],
    )
})

# Inverse lookups: non-standard spelling -> standard term. When a spelling is
# listed under several standard terms, the last one encountered wins.
to_stand_street_type = dict(
    (variant, standard)
    for standard, variants in to_non_stand_street_type.items()
    for variant in variants
)
to_stand_street_dir = dict(
    (variant, standard)
    for standard, variants in to_non_stand_street_dir.items()
    for variant in variants
)
to_std = dict(
    (variant, standard)
    for standard, variants in to_non_std.items()
    for variant in variants
)

# The standard terms themselves, in table order.
street_type = [*to_non_stand_street_type]
street_dir = [*to_non_stand_street_dir]


### Shared Functions

In [4]:
def prob(p: float) -> bool:
    "Returns true with probability p"

    return rand.uniform(0, 1) < p


def insert_spaces(s: str, p: float) -> str:
    """
    s - input string
    p - probability of inserting a space after any given character

    Returns s with a space inserted between adjacent characters with prob p.
    No space is ever added after the last character.
    """

    # The parameters were previously annotated with the `string` module rather than `str`.
    chars = list(s)
    for i in range(len(chars) - 1):
        if prob(p):
            chars[i] += " "
    return "".join(chars)

The following functions take a string and apply synthetic OCR augmentations to it.

In [5]:
# nlpaug word augmenter that augments a string by splitting it (min_char=3).
aug_split = naw.SplitAug(min_char=3)
# OCR confusion matrix used by augment(): the row for a character supplies the
# probabilities with which that character is replaced by each column's character.
conf_matrix = pd.read_csv("ocr_confusion_matrix.csv", index_col=0)
# Confusion matrix used by pc_augment() and num_augment(); a character's row is used as sampling weights.
# NOTE(review): the original comment said "num can only become a num", while the file
# name suggests digits may become letters - confirm against the CSV contents.
conf_matrix_num_to_letter = pd.read_csv("ocr_confusion_matrix_num_to_letter.csv", index_col=0)

def pc_augment(pc : str, letter_aug = 0.1, num_aug = 0.2) -> str:
    """
    pc - postal code
    Used to augment postal code terms. Each digit of pc is replaced with prob num_aug and
    every other character with prob letter_aug; replacements are drawn from
    conf_matrix_num_to_letter. Characters that are not a column of that matrix are kept as they are.
    """

    candidates = conf_matrix_num_to_letter.columns
    out = []
    for c in pc:
        if c not in candidates:
            out.append(c)
            continue
        # keep the character with prob (1 - aug), otherwise sample a replacement
        keep_p = (1 - num_aug) if c.isdigit() else (1 - letter_aug)
        if prob(keep_p):
            out.append(c)
        else:
            out.append(rand.choices(candidates, weights=conf_matrix_num_to_letter.loc[c], k = 1)[0])
    return "".join(out)


def num_augment(s : str, letter_aug = 0.05, num_aug = 0.05) -> str:
    """
    s - num
    Used to augment num terms (street numbers, units). Each digit of s is replaced with prob
    num_aug and every other character with prob letter_aug; replacements are drawn from
    conf_matrix_num_to_letter. Characters that are not a column of that matrix are kept as they are.
    """

    candidates = conf_matrix_num_to_letter.columns
    out = []
    for c in s:
        if c not in candidates:
            out.append(c)
            continue
        # Keep with prob (1 - aug), exactly like pc_augment. The previous code kept the
        # character with prob aug instead, so the 0.05 defaults replaced ~95% of the characters.
        keep_p = (1 - num_aug) if c.isdigit() else (1 - letter_aug)
        if prob(keep_p):
            out.append(c)
        else:
            out.append(rand.choices(candidates, weights=conf_matrix_num_to_letter.loc[c], k = 1)[0])
    return "".join(out)


def rm_space_augment(s : str, keep_ws_p = 0.9) -> str:
    "Returns s where each whitespace character is kept with prob keep_ws_p (and dropped otherwise)"

    kept = []
    for ch in s:
        # non-whitespace is always kept; a random draw is only made for whitespace
        if ch in string.whitespace and not prob(keep_ws_p):
            continue
        kept.append(ch)
    return "".join(kept)


def augment(s : str, letter_aug = 0.85, num_aug = 0.98, keep_p = 1) -> str:
    """
    s - string
    Used to augment general strings. Every character of s is first kept with prob keep_p
    (dropped otherwise). A surviving digit is left unchanged with prob num_aug and any other
    surviving character with prob letter_aug; otherwise it is replaced by a character drawn
    from its row of conf_matrix. Characters that are not a column of conf_matrix are never replaced.

    Note that, unlike in pc_augment, letter_aug and num_aug are the probabilities of
    leaving a character unchanged.
    """

    out = []
    for c in s:
        if not prob(keep_p):
            continue  # character dropped
        if c not in conf_matrix.columns:
            out.append(c)
            continue
        unchanged_p = num_aug if c.isdigit() else letter_aug
        if prob(unchanged_p):
            out.append(c)
        else:
            out.append(choice(conf_matrix.columns, p=conf_matrix.loc[c]))
    return "".join(out)

The following functions generate synthetic address terms

In [6]:
# Faker instance for the "en_CA" locale, used to generate English street names.
fake_en = Faker("en_CA")
# Faker instance for the "fr_CA" locale, used to generate French street names.
fake_fr = Faker("fr_CA")

# Ordinal suffix by last digit; any digit not listed here falls back to "th" in rand_ordinal_street.
SUFFIXES = {1: "st", 2: "nd", 3: "rd"}

def rand_ordinal_street():
    """
    Returns a random ordinal street name for a number in 1..120, e.g. "3rd" or "112th".
    With prob 0.3 the ordinal suffix is left off and only the number is returned.
    """

    n = rand.randint(1, 120)
    if 10 <= n % 100 <= 20:
        # 11th, 12th, 13th, ... always take "th"
        ending = "th"
    else:
        ending = SUFFIXES.get(n % 10, "th")

    return str(n) if prob(0.3) else str(n) + ending


def rand_unit_num():
    """
    Returns a random unit term: a number in 1..120 (prob 0.5), a number in 1..9999 (prob 0.35),
    or a number in 1..9999 followed by a lowercase letter (prob 0.15).
    """

    small = str(rand.randint(1, 120))
    large = str(rand.randint(1, 9999))
    lettered = str(rand.randint(1, 9999)) + rand.choice(ascii_lowercase)
    return choice([small, large, lettered], p=[0.5, 0.35, 0.15])

def rand_rr():
    "Returns a random rural road term, 'rr' followed by a number in 1..120, with or without a space"

    spaced = "rr " + str(rand.randint(1, 120))
    joined = "rr" + str(rand.randint(1, 120))
    return rand.choice([spaced, joined])

`gen_parsed_tags` is the main driver function. It takes the raw address terms and their tags and returns a pair (address, tags): the words of the generated address and the tag of each word.

In [13]:
def get_insertion_index(tags: List[str]):
    """
    tags - list of address tags (eg. ["street_no", "street_name", "postal_code"])

    Returns a random index at which another address term may be inserted. The start and the
    end of the list are always candidates; an interior position is a candidate only when the
    tags on either side of it differ in their first five characters, so that terms which
    belong together (e.g. street number and street name) are never split apart.
    """
    boundaries = [
        pos
        for pos in range(1, len(tags))
        if tags[pos - 1][:5] != tags[pos][:5]
    ]
    return rand.choice([0, len(tags)] + boundaries)


def _is_street_level(tag: str) -> bool:
    "Returns True for tags whose terms must keep their relative order (street number/name, unit, penthouse, po box)"

    return tag.startswith("Street") or tag in ("Unit", "ph", "PoBox")


def _is_numeric_term(tag: str) -> bool:
    "Returns True for tags whose terms are numbers (unit, po box, street number)"

    return tag in ("Unit", "PoBox", "StreetNumber")


def gen_parsed_tags(wt : List[str], tt: List[str], is_en = True) -> tuple[List[str], List[str]]:
    """
    wt (word terms) - list of address terms (eg. ["76", "abc street"])
    tt (tags terms) - the address tag for each address term in wt (eg. ["StreetNumber", "StreetName"]).
                      Recognised tags: "Unit", "ph", "PoBox", "StreetNumber", "StreetName", "RuralRoad",
                      "Municipality", "PostalCode", "Province", "Country". Terms with any other tag are dropped.
    is_en - whether the address is english or french (currently not used by this function)

    returns a pair (words, tags): the words of the generated address and, for each word, its tag.
    Both lists always have the same length. The trained deepparse should parse the address as the tags.
    The input lists are not modified.
    """

    assert len(wt) == len(tt)
    wt, tt = list(wt), list(tt)  # work on copies so the caller's lists are left untouched

    # perform rearrangments on wt, tt. Want to mix up ordering of address terms since not all addresses
    # come in the same order. wt and tt must undergo the same rearrangment.
    # (These checks previously compared against snake_case tags such as "postal_code" that callers
    # never pass, so street-level terms were not protected; they now use the actual tag names.)
    r = rand.uniform(0,1)
    if r < 0.1:
        str_wt, str_tt = [], []
        nstr_wt, nstr_tt = [], []

        # Take out the street-level terms as their relative order must be preserved
        for word, tag in zip(wt, tt):
            if _is_street_level(tag):
                str_wt.append(word)
                str_tt.append(tag)
            else:
                nstr_wt.append(word)
                nstr_tt.append(tag)

        # Randomly shuffle the remaining terms
        msk = list(range(len(nstr_wt)))
        rand.shuffle(msk)
        wt = [nstr_wt[k] for k in msk]
        tt = [nstr_tt[k] for k in msk]

        # Insert the street-level terms back in as one contiguous run
        iind = get_insertion_index(tt)
        wt, tt = wt[:iind] + str_wt + wt[iind:], tt[:iind] + str_tt + tt[iind:]
    elif r < 0.65:
        # Just swap positions of two non street-level address terms
        swappable_inds = [k for k, tag in enumerate(tt) if not _is_street_level(tag)]
        if len(swappable_inds) > 2:
            i, j = rand.sample(swappable_inds, 2)
            wt[i], wt[j] = wt[j], wt[i]
            tt[i], tt[j] = tt[j], tt[i]
    elif r < 0.8 and "PostalCode" in tt:
        # Just move location of postal code (never into the street number / street name terms)
        i = tt.index("PostalCode")
        swappable_inds = [k for k, tag in enumerate(tt) if not tag.startswith("Street")]
        if len(swappable_inds) > 1:
            j = rand.choice(swappable_inds)
            wt[i], wt[j] = wt[j], wt[i]
            tt[i], tt[j] = tt[j], tt[i]

    new_wt, new_tt = [], []

    # Street-level terms (unit, penthouse, street number, street name) are either all kept or all
    # dropped, so the model also learns to parse incomplete addresses
    keep_street = prob(0.99)

    # Learn to also parse addresses with duplicate terms (streetnames, city names)
    pot_dup_words, pot_dup_tags = [], []

    for word, tag in zip(wt, tt):
        if tag == "Municipality":
            new_wt.append(word)
            new_tt.append(tag)
            if prob(0.8):
                pot_dup_words.append(word)
                pot_dup_tags.append(tag)
        elif tag == "PostalCode":
            pc = word.replace(" ", "")
            if len(pc) < 6 or prob(0.2):
                continue  # incomplete postal codes are always dropped, complete ones with prob 0.2
            if word[3] != " " and len(word) == 6:
                word = word[:3] + " " + word[3:]
            # Mostly the full postal code, occasionally only its first or last half.
            # The probabilities previously summed to 0.98, which makes numpy's choice raise a ValueError.
            word = str(choice([word, pc[:3], pc[3:]], p=[0.96, 0.02, 0.02]))
            new_wt.append(word)
            new_tt.append(tag)
            if prob(0.8):
                pot_dup_words.append(word)
                pot_dup_tags.append(tag)
        elif tag == "Province":
            nst = word if word not in to_non_stand_prov or prob(0.6) else rand.choice(to_non_stand_prov[word])
            new_wt.append(nst)
            new_tt.append(tag)
            if prob(0.6):
                pot_dup_words.append(nst)
                pot_dup_tags.append(tag)
        elif tag == "Country":
            nst = word if prob(0.7) else "canada"
            new_wt.append(nst)
            new_tt.append(tag)
            if prob(0.6):
                pot_dup_words.append(nst)
                pot_dup_tags.append(tag)
        elif tag == "RuralRoad":
            new_wt.append(word)
            new_tt.append(tag)
        elif tag == "PoBox":
            unit_sep = rand.choice(["#", "# ", " #", " # ", "", " ", " ", " "])
            # The variant is picked outside the f-string: reusing the same quote character inside
            # an f-string expression is a syntax error before Python 3.12.
            alt_name = rand.choice(to_non_stand_num_term["po box"])
            new_wt.append(rand.choice([f"po box{unit_sep}{word} ", f"{alt_name}{unit_sep}{word}"]))
            new_tt.append(tag)
        elif not keep_street:
            continue
        elif tag == "Unit":
            unit_sep = rand.choice(["#", "# ", " #", " # ", ""] + [" "] * 8)
            alt_name = rand.choice(to_non_stand_num_term["unit"])
            options = [f"unit{unit_sep}{word}", f"{unit_sep}{word}".strip(), f"{alt_name}{unit_sep}{word}"]
            new_wt.append(str(choice(options, p=[0.2, 0.5, 0.3])))
            new_tt.append(tag)
        elif tag == "ph":
            unit_sep = rand.choice(["#", "# ", " #", " # ", "", "", " ", " ", " "])
            alt_name = rand.choice(to_non_stand_num_term["ph"])
            new_wt.append(rand.choice([f"ph {unit_sep}{word} ", f"{alt_name}{unit_sep}{word}"]))
            new_tt.append("Unit")  # a penthouse number is emitted under the Unit tag
        elif tag == "StreetNumber":
            new_wt.append(word)
            new_tt.append(tag)
        elif tag == "StreetName":
            # first standardize every word, then swap some of them for a non standard equivalent
            sst = " ".join([w if w not in to_std else to_std[w] for w in word.split()])
            nst = " ".join([w if w not in to_non_std or prob(0.7) else rand.choice(to_non_std[w]) for w in sst.split()])
            new_wt.append(nst)
            new_tt.append(tag)
            if prob(0.8):
                pot_dup_words.append(nst)
                pot_dup_tags.append(tag)

    if prob(0.03):
        new_wt.extend(pot_dup_words)
        new_tt.extend(pot_dup_tags)

    wt, tt = new_wt, new_tt

    # Apply OCR augmentation to address
    if prob(0.2):
        words_new, tags_new = [], []
        for word, tag in zip(wt, tt):
            augword = word
            numeric = _is_numeric_term(tag)
            if tag == "PostalCode":
                # pc_augment's defaults are used. The former call passed letter_aug=0.9, num_aug=0.8,
                # which pc_augment treats as the probability of CHANGING a character; it never ran
                # because the tag check could not match.
                augword = pc_augment(augword)
            elif not numeric and prob(0.3):
                augword = augment(augword)
            elif numeric and prob(0.03):
                # The probability now applies to every numeric tag; `and` binding tighter than `or`
                # previously restricted it to the last alternative only.
                augword = num_augment(augword)
            if not numeric and prob(0.02):
                augword = aug_split.augment(augword)[0]

            pieces = augword.split()
            tags_new.extend([tag] * len(pieces))
            words_new.extend(pieces)
        wt, tt = words_new, tags_new

    # Flatten multi-word terms into one (word, tag) pair per word. The tag is repeated once per WORD
    # of the term; it used to be repeated once per word of the tag itself (i.e. always once), which
    # left the two returned lists with different lengths.
    res_wt, res_tt = [], []
    for word, tag in zip(wt, tt):
        pieces = word.split()
        res_wt.extend(pieces)
        res_tt.extend([tag] * len(pieces))

    return res_wt, res_tt

### Loading Data

Load data from spreadsheets and perform data cleaning

In [8]:
# --- Statistics Canada open address files, one CSV per province ---
# Paths use forward slashes: the former "data\ODA_..." style literals contained invalid escape
# sequences ("\O", "\C", "\c"), which trigger a warning on current Python versions, and only
# resolve on Windows.
statcan_data = []
statcan_relevant_cols = ["street_no", "street", "city", "postal_code", "Province"]
dtypes = {key: str for key in statcan_relevant_cols}
for prov in province_lst:
    # NOTE(review): prov.capitalize() yields e.g. "On"; confirm the file names really use that casing.
    df = pd.read_csv(f"data/ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
    df["Province"] = prov
    # Drop rows with an unassigned street or postal code, and rows whose street number is
    # missing or contains a "-"
    df = df[(df["street"] != "UNASSIGNED")& (df["postal_code"] != "UNASSIGNED")& (df["street_no"].str.contains("-") == False)]
    df = df.dropna(subset  = ["str_name"])
    # Randomly drop 75% of the rows that have no postal code
    df = df.drop(df[df["postal_code"].isnull()].sample(frac=0.75).index)
    statcan_data.append(df)


statcan_df = pd.concat(statcan_data, ignore_index=True)
statcan_df = statcan_df.rename(columns={"unit" : "Unit", "street_no" : "StreetNumber", "street": "StreetName", "postal_code": "PostalCode", "city": "Municipality"})
statcan_df["PoBox"] = ""
statcan_df["RuralRoad"] = ""
statcan_df["Address"] = ""
statcan_df["Tags"] = ""

# --- Postal code / city / province table ---
# Selecting then renaming yields a new frame, so the column assignments below do not write
# into a slice of the frame that was read.
cpcdf = (
    pd.read_csv("data/CanadianPostalCodes202312.csv")[["POSTAL_CODE", "CITY", "PROVINCE_ABBR"]]
    .rename(columns={"POSTAL_CODE": "PostalCode", "CITY": "Municipality", "PROVINCE_ABBR": "Province"})
)
cpcdf["Unit"] = ""
cpcdf["PoBox"] = ""
cpcdf["StreetNumber"] = ""
cpcdf["StreetName"] = ""
cpcdf["RuralRoad"] = ""  # was a second, redundant cpcdf["StreetName"] = "" assignment
cpcdf["Address"] = ""
cpcdf["Tags"] = ""

# --- Geographical names: every populated place becomes an address made of a municipality only ---
city_df = pd.read_csv("data/cgn_canada_csv_eng.csv")
city_df = (
    city_df[(city_df['Generic Category']== "Populated Place") & (city_df['Language']== "Undetermined")]
    .drop_duplicates(subset=['Geographical Name'])
    .copy()
)
city_df['Address'] = city_df['Geographical Name'].apply(lambda city : city.lower())
# one "Municipality" tag per word of the place name
city_df['Tags']=city_df['Address'].apply(lambda city : ["Municipality"]*len(city.split()))
city_df = city_df[['Address', 'Tags']]

  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  cpcdf = pd.read_csv("data\CanadianPostalCodes202312.csv")
  city_df = pd.read_csv("data\cgn_canada_csv_eng.csv")
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"

In [14]:
# Generate one synthetic address (and its tags) for every row of the Statistics Canada data.

# Placeholder values of the postal code column that are blanked out
illegal_pc = ["VACANT", "QUEENS", "RMC", "na", "UNKNOWN", "UNASSIGNED"]
# Row fields copied into the address, in this order.
# NOTE(review): "PostalCode" is cleaned below but is not part of this list, so it never
# reaches the generated address - confirm whether that is intended.
statcan_tags = ["StreetNumber", "StreetName", "RuralRoad", "Municipality", "Province"]

for idx, row in tqdm(statcan_df.iterrows(), total=statcan_df.shape[0]):
    words, tags = [], []

    # Optionally lead with exactly one of: unit, po box, penthouse
    if prob(0.3):
        words.append(rand_unit_num())
        tags.append("Unit")
    elif prob(0.4):
        words.append(choice([f"{rand.randint(1,9999)}", f"{rand.randint(1,99999)}"], p= [0.8, 0.2]))
        tags.append("PoBox")
    elif prob(0.1):
        words.append(rand_unit_num())
        tags.append("ph")

    row["RuralRoad"] = rand_rr() if prob(0.05) else ""
    if row["PostalCode"] in illegal_pc:
        row["PostalCode"] = ""

    # Copy every non-blank string field of the row
    for tag in statcan_tags:
        value = row[tag]
        if isinstance(value, str) and not value.isspace() and value != "":
            tags.append(tag)
            words.append(value.lower())

    if prob(0.4):
        words.append("ca")
        tags.append("Country")

    words, tags = gen_parsed_tags(words, tags, is_en=row["Province"].lower() != "qc")
    statcan_df.at[idx, "Address"] = " ".join(words)
    statcan_df.at[idx, "Tags"] = f"{tags}"

statcan_df = statcan_df[["Address", "Tags"]]
statcan_df.head()

  9%|▉         | 148249/1583771 [00:57<14:08, 1692.00it/s]

In [None]:
# Generate one synthetic address (and its tags) for every postal code row. Street names and
# numbers are generated, since this table only provides postal code, city and province.
for idx, row in tqdm(cpcdf.iterrows(), total=cpcdf.shape[0]):
    words, tags = [], []

    # Quebec rows get French street names and directions, all other provinces English ones
    in_quebec = row["Province"].lower() == "qc"
    faker, directions = (fake_fr, street_dir_fr) if in_quebec else (fake_en, street_dir_en)
    row["StreetName"] = faker.street_name().lower() if prob(0.995) else rand_ordinal_street()
    if prob(0.05):
        row["StreetName"] += " " + rand.choice(directions)

    row["StreetNumber"] = f"{rand.randint(1,9999)}" if prob(0.95) else ""
    row["RuralRoad"] = rand_rr() if prob(0.05) else ""

    # Optionally lead with exactly one of: unit, po box, penthouse
    if prob(0.2):
        words.append(rand_unit_num())
        tags.append("Unit")
    elif prob(0.3):
        words.append(choice([f"{rand.randint(1,9999)}", f"{rand.randint(1,99999)}"], p= [0.8, 0.2]))
        tags.append("PoBox")
    elif prob(0.1):
        words.append(rand_unit_num())
        tags.append("ph")

    # Copy every non-blank string field of the row
    rel_tags = ["StreetNumber", "StreetName", "RuralRoad", "Municipality", "PostalCode", "Province"]
    for tag in rel_tags:
        value = row[tag]
        if isinstance(value, str) and not value.isspace() and value != "":
            tags.append(tag)
            words.append(value.lower())

    if prob(0.3):
        words.append("ca")
        tags.append("Country")

    words, tags = gen_parsed_tags(words, tags, is_en=not in_quebec)
    cpcdf.at[idx, "Address"] = " ".join(words)
    cpcdf.at[idx, "Tags"] = f"{tags}"

cpcdf = cpcdf[["Address", "Tags"]]
cpcdf.head()

Unnamed: 0,Address,Tags
0,pb#7193 6125 david ports rr 5 pembroke ontario...,"['PoBox', 'StreetNumber', 'StreetName', 'Stree..."
1,9366 blvd bolduc jonquiere g8a 1y9 qc,"['StreetNumber', 'StreetName', 'StreetName', '..."
2,ca nl a2h lb8 ste# 5437b 6652 gamble groves co...,"['Country', 'Province', 'PostalCode', 'PostalC..."
3,9250 anderson mtns athabasca t9s 1n3,"['StreetNumber', 'StreetName', 'StreetName', '..."
4,8848 smith divide grande prairie alberta,"['StreetNumber', 'StreetName', 'StreetName', '..."


The following postal codes are rarely seen and are sometimes standardized incorrectly, so we generate 1000 addresses for each of them.

In [None]:
import copy

# Lesser-seen postal codes mapped to the municipality name that is emitted
# alongside them in the generated addresses.
#
# Values are stored without surrounding whitespace and with correctly encoded
# accented characters: the previous literals contained stray leading spaces and
# mojibake (UTF-8 bytes decoded with the wrong code page), both of which leaked
# straight into the generated training data. Apostrophes are plain ASCII, as in
# the other apostrophised names of this table.
#
# NOTE(review): "B0N 1Y0" and "G0N 1B0" each list two comma-separated place
# names; they are kept verbatim - confirm whether one name should be chosen.
pc_city_map = {
    "P0B 1A0": "Baysville",
    "P0B 1E0": "Milford Bay",
    "P0B 1G0": "Minett",
    "P0B 1J0": "Port Carling",
    "P0B 1K0": "Port Sandfield",
    "P0B 1L0": "Port Sydney",
    "P0B 1M0": "Utterson",
    "P0B 1P0": "Windermere",
    "P0H 1A0": "Arnstein",
    "P0H 1B0": "Astorville",
    "P0H 1C0": "Bear Island",
    "P0H 1E0": "Bonfield",
    "P0H 1G0": "Cache Bay",
    "P0H 1H0": "Callander",
    "P0H 1J0": "Commanda",
    "P0H 1K0": "Corbeil",
    "P0H 1L0": "Crystal Falls",
    "P0H 1M0": "Field",
    "P0H 1N0": "Golden Valley",
    "P0H 1P0": "Hornell Heights",
    "P0H 1R0": "Lavigne",
    "P0H 1S0": "Loring",
    "P0H 1T0": "Marten River",
    "P0H 1V0": "Mattawa",
    "P0H 1W0": "Nipissing",
    "P0H 1Y0": "Port Loring",
    "P0H 1Z0": "Powassan",
    "P0H 2A0": "Redbridge",
    "P0H 2C0": "River Valley",
    "P0H 2E0": "Rutherglen",
    "P0H 2H0": "Temagami",
    "P0H 2J0": "Thorne",
    "P0H 2K0": "Tilden Lake",
    "P0H 2L0": "Trout Creek",
    "P0H 2M0": "Verner",
    "P0H 2N0": "Warren",
    "P0H 2R0": "Restoule",
    "V0N 1A0": "Alert Bay",
    "V0N 1B0": "Whistler",
    "V0N 1E0": "Blubber Bay",
    "V0N 1G0": "Bowen Island",
    "V0N 1H0": "Brackendale",
    "V0N 1J0": "Britannia Beach",
    "V0N 1K0": "Coal Harbour",
    "V0N 1L0": "D'Arcy",
    "V0N 1M0": "Dawsons Landing",
    "V0N 1N0": "Egmont",
    "V0N 1P0": "Galiano Island",
    "V0N 1S0": "Garden Bay",
    "V0N 1T0": "Garibaldi Highlands",
    "V0N 1V0": "Gibsons",
    "V0N 1W0": "Gillies Bay",
    "V0N 1Z0": "Holberg",
    "V0N 2B0": "Kingcome Inlet",
    "V0N 2E0": "Lions Bay",
    "V0N 2G0": "Lund",
    "V0N 2H0": "Madeira Park",
    "V0N 2H1": "Pender Harbour",
    "V0N 2J0": "Mayne Island",
    "V0N 2K0": "Mount Currie",
    "V0N 2L0": "Pemberton",
    "V0N 2M0": "Pender Island",
    "V0N 2N0": "Port Alice",
    "V0N 2P0": "Port Hardy",
    "V0N 2R0": "Port McNeill",
    "V0N 2S0": "Port Mellon",
    "V0N 2V0": "Quatsino",
    "V0N 2W0": "Roberts Creek",
    "V0N 2Y0": "Saturna",
    "V0N 3A0": "Sechelt",
    "V0N 3B0": "Seton Portage",
    "V0N 3C0": "Shalalth",
    "V0N 3E0": "Sointula",
    "V0N 3J0": "Telegraph Cove",
    "V0N 3K0": "Van Anda",
    "V0N 3L0": "Winter Harbour",
    "V0N 3P0": "Woss",
    "V0N 3V0": "Gibsons",
    "V0N 3Z0": "Furry Creek",
    "V0R 1A0": "Ahousat",
    "V0R 1B0": "Bamfield",
    "V0R 1G0": "Bowser",
    "V0R 1H0": "Cassidy",
    "V0R 1K0": "Chemainus",
    "V0R 1L0": "Cobble Hill",
    "V0R 1M0": "Coombs",
    "V0R 1N0": "Cowichan Bay",
    "V0R 1R0": "Crofton",
    "V0R 1S0": "Cumberland",
    "V0R 1T0": "Denman Island",
    "V0R 1V0": "Errington",
    "V0R 1W0": "Fanny Bay",
    "V0R 1X0": "Gabriola",
    "V0R 1Y0": "Honeymoon Bay",
    "V0R 1Z0": "Hornby Island",
    "V0R 2B0": "Kildonan",
    "V0R 2C0": "Koksilah",
    "V0R 2G0": "Lake Cowichan",
    "V0R 2H0": "Lantzville",
    "V0R 2J0": "Lasqueti",
    "V0R 2K0": "Lazo",
    "V0R 2L0": "Malahat",
    "V0R 2M0": "Merville",
    "V0R 2N0": "Mesachie Lake",
    "V0R 2P0": "Mill Bay",
    "V0R 2V0": "Royston",
    "V0R 2W0": "Shawnigan Lake",
    "V0R 2Y0": "Thetis Island",
    "V0R 2Z0": "Tofino",
    "V0R 3A0": "Ucluelet",
    "V0R 3B0": "Union Bay",
    "V0R 3C0": "Westholme",
    "V0R 3E1": "Youbou",
    "V0R 4K0": "Chemainus",
    "V0R 5K0": "Penelakut Island",
    "L0N 1A0": "Alton",
    "L0N 1B0": "Belfountain",
    "L0N 1C0": "Caledon Village",
    "L0N 1E0": "Caledon East",
    "L0N 1G0": "Grand Valley",
    "L0N 1H0": "Honeywood",
    "L0N 1J0": "Horning's Mills",
    "L0N 1K0": "Inglewood",
    "L0N 1L0": "Laurel",
    "L0N 1M0": "Mansfield",
    "L0N 1N0": "Orton",
    "L0N 1P0": "Palgrave",
    "L0N 1R0": "Rosemont",
    "L0N 1S0": "Shelburne",
    "N0N 1A0": "Alvinston",
    "N0N 1B0": "Brigden",
    "N0N 1C0": "Bright's Grove",
    "N0N 1E0": "Camlachie",
    "N0N 1G0": "Corunna",
    "N0N 1H0": "Courtright",
    "N0N 1J0": "Forest",
    "N0N 1K0": "Inwood",
    "N0N 1M0": "Mooretown",
    "N0N 1N0": "Oil City",
    "N0N 1P0": "Oil Springs",
    "N0N 1R0": "Petrolia",
    "N0N 1T0": "Wyoming",
    "A0N 1A0": "Aguathuna",
    "A0N 1B0": "Barachois Brook",
    "A0N 1C0": "Cape Ray",
    "A0N 1E0": "Cape St. George",
    "A0N 1G0": "Cartyville",
    "A0N 1H0": "Codroy",
    "A0N 1J0": "Doyles",
    "A0N 1K0": "Grand Bay East",
    "A0N 1M0": "Heatherton",
    "A0N 1N0": "Highlands",
    "A0N 1P0": "Jeffrey's",
    "A0N 1R0": "Lourdes",
    "A0N 1S0": "Noels Pond",
    "A0N 1T0": "Port au Port",
    "A0N 1V0": "Robinsons",
    "A0N 1W0": "St. Andrew's",
    "A0N 1X0": "St. David's",
    "A0N 1Y0": "St. Fintan's",
    "A0N 1Z0": "St. George's",
    "A0N 2B0": "South Branch",
    "A0N 2C0": "Stephenville Crossing",
    "A0N 2E0": "West Bay Centre",
    "A0N 2G0": "Black Duck Siding",
    "A0N 2H0": "Burgeo",
    "A0N 2J0": "Ramea",
    "A0N 2K0": "François",
    "A0N 2L0": "Grey River",
    "B0N 1C0": "Brookfield",
    "B0N 1E0": "Centre Burlington",
    "B0N 1G0": "Cheverie",
    "B0N 1H0": "Curry's Corner",
    "B0N 1J0": "Densmore Mills",
    "B0N 1K0": "Elderbank",
    "B0N 1L0": "Ellershouse",
    "B0N 1P0": "Kennetcook",
    "B0N 1T0": "Maitland",
    "B0N 1V0": "Meaghers Grant",
    "B0N 1W0": "Micmac",
    "B0N 1X0": "Middle Musquodoboit",
    "B0N 1Y0": "Milford Station, Milford",
    "B0N 1Z0": "Mount Uniacke",
    "B0N 2A0": "Newport",
    "B0N 2B0": "Newport Station",
    "B0N 2C0": "Noel",
    "B0N 2E0": "Ste-Croix",
    "B0N 2G0": "Scotch Village",
    "B0N 2H0": "Shubenacadie",
    "B0N 2J0": "Stewiacke",
    "B0N 2K0": "Summerville",
    "B0N 2L0": "Upper Kennetcook",
    "B0N 2M0": "Upper Musquodoboit",
    "B0N 2N0": "Upper Rawdon",
    "B0N 2P0": "Upper Stewiacke",
    "B0N 2R0": "Walton",
    "B0N 2T0": "Windsor",
    "B0N 3A0": "Ellershouse",
    "G0N 1B0": "Saint-Joseph-de-Coleraine, Saint-Julien",
    "G0N 1C0": "Sainte-Clotilde-de-Beauce",
    "G0N 1E0": "Disraeli",
    "G0N 1E1": "Sainte-Praxède",
    "G0N 1E2": "Saint-Jacques-le-Majeur-de-Wolfestown",
    "G0N 1G0": "Sacré-Cœur-de-Jésus",
    "G0N 1H0": "East Broughton",
    "G0N 1J0": "Saint-Jacques-de-Leeds",
    "G0N 1K0": "Kinnear's Mills",
    "G0N 1M0": "Saint-Adrien-d'Irlande",
    "G0N 1N0": "Saint-Ferdinand",
    "G0N 1P0": "Saint-Frédéric",
    "G0N 1R0": "Saint-Jules",
    "G0N 1S0": "Adstock",
    "G0N 1T0": "Saint-Pierre-de-Broughton",
    "G0N 1V0": "Saint-Séverin",
    "G0N 1X0": "Tring-Jonction",
    "S0N 0A0": "Abbey",
    "S0N 0B0": "Admiral",
    "S0N 0C0": "Aneroid",
    "S0N 0E0": "Blumenhof",
    "S0N 0G0": "Bracken",
    "S0N 0H0": "Burstall",
    "S0N 0J0": "Cabri",
    "S0N 0K0": "Cadillac",
    "S0N 0M0": "Claydon",
    "S0N 0N0": "Climax",
    "S0N 0P0": "Consul",
    "S0N 0S0": "Dollard",
    "S0N 0T0": "Eastend",
    "S0N 0V0": "Fox Valley",
    "S0N 0W0": "Frontier",
    "S0N 0X0": "Glenbain",
    "S0N 0Y0": "Golden Prairie",
    "S0N 1A0": "Gull Lake",
    "S0N 1C0": "Hazenmore",
    "S0N 1E0": "Hazlet",
    "S0N 1G0": "Lancer",
    "S0N 1H0": "Leader",
    "S0N 1L0": "Liebenthal",
    "S0N 1M0": "McMahon",
    "S0N 1N0": "Maple Creek",
    "S0N 1P0": "Mendham",
    "S0N 1S0": "Neidpath",
    "S0N 1T0": "Neville",
    "S0N 1V0": "Orkney",
    "S0N 1W0": "Pambrun",
    "S0N 1X0": "Pennant Station",
    "S0N 1Y0": "Piapot",
    "S0N 1Z0": "Ponteix",
    "S0N 2A0": "Portreeve",
    "S0N 2B0": "Prelate",
    "S0N 2E0": "Richmound",
    "S0N 2G0": "Robsart",
    "S0N 2H0": "Sceptre",
    "S0N 2L0": "Shackleton",
    "S0N 2M0": "Shaunavon",
    "S0N 2N0": "Simmie",
    "S0N 2P0": "Stewart Valley",
    "S0N 2R0": "Success",
    "S0N 2S0": "Tompkins",
    "S0N 2T0": "Val Marie",
    "S0N 2V0": "Vanguard",
    "S0N 2W0": "Vidora",
    "S0N 2X0": "Webb",
    "S0N 2Y0": "Wymark",
}

# First letter of a (lower-cased) postal code -> two-letter province code.
# Written as (province, letters) groups and flattened, so provinces that own
# several leading letters are listed once.
pc_to_prov = {
    letter: province
    for province, letters in (
        ("nl", "a"),
        ("ns", "b"),
        ("qc", "ghj"),
        ("on", "klmnp"),
        ("mb", "r"),
        ("sk", "s"),
        ("ab", "t"),
        ("bc", "v"),
    )
    for letter in letters
}

# Build `popc_df`: synthetic addresses for every postal code in `pc_city_map`.
#
# Each address is assembled as parallel lists of word tokens (`wt`) and their
# tags (`tt`), handed to `gen_parsed_tags`, and stored as a space-joined
# address string plus the stringified tag list.

# Number of synthetic addresses generated per postal code.
ADDRESSES_PER_POSTAL_CODE = 1000

addr = []
tags = []

for postal_code, city_name in pc_city_map.items():
    # Everything derived from the postal code is constant for the inner loop,
    # so compute it once per postal code.
    pc = postal_code.lower()
    # strip(): a stray leading/trailing space in the table must not end up
    # inside the municipality token of the generated address.
    city = city_name.strip().lower()
    province = pc_to_prov[pc[0]]
    is_en = province != "qc"
    # Quebec addresses get French street names, consistent with the `is_en`
    # flag passed to `gen_parsed_tags` below (previously the English faker was
    # used for every province).
    street_faker = fake_en if is_en else fake_fr

    for _ in range(ADDRESSES_PER_POSTAL_CODE):
        wt, tt = [], []

        # Optional leading component: a unit number, otherwise possibly a PO box
        # (the PO box draw only happens when the unit draw fails).
        if prob(0.1):
            wt.append(rand_unit_num())
            tt.append("Unit")
        elif prob(0.5):
            wt.append(f"{rand.randint(1,9999)}")
            tt.append("PoBox")

        wt.append(f"{rand.randint(1,9999)}")
        tt.append("StreetNumber")

        # Mostly faker street names, with an occasional ordinal street.
        wt.append(street_faker.street_name().lower() if prob(0.995) else rand_ordinal_street())
        tt.append("StreetName")

        wt.append(city)
        tt.append("Municipality")

        wt.append(pc)
        tt.append("PostalCode")

        wt.append(province)
        tt.append("Province")

        if prob(0.4):
            wt.append("ca")
            tt.append("Country")

        wt, tt = gen_parsed_tags(wt, tt, is_en=is_en)

        addr.append(" ".join(wt))
        tags.append(f"{tt}")

popc_df = pd.DataFrame({"Address": addr, "Tags": tags})

In [None]:
# Stack every synthetic source into a single frame, keep only rows whose
# address string is non-empty, and write the training set to disk.
all_df = (
    pd.concat([statcan_df, cpcdf, city_df, popc_df], ignore_index=True)
    .loc[lambda combined: combined['Address'].str.len() > 0]
)
all_df.to_csv('deepparse_train_1.csv', index=False)
