In [1]:
import pandas as pd
import numpy as np
import random as rand
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import ast
import re
import string
from copy import deepcopy
from faker import Faker

from numpy.random import choice
from numpy.testing import assert_almost_equal, assert_equal
from string import ascii_lowercase, digits
from addr_term_dict import province_lst, cols, street_type, to_non_stand_num_term, to_non_stand_street_type, to_non_stand_street_dir, to_non_stand_prov, to_std, to_stand_street_type, to_stand_street_dir, to_non_std, street_dir


fake_en = Faker("en_CA")
fake_fr = Faker("fr_CA")

aug_split = naw.SplitAug(min_char=3)

deepparse_tags = [
    "Unit",
    "StreetNumber",
    "StreetName",
    "Municipality",
    "Province",
    "PostalCode",
]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
to_non_stand_street_type.keys()

dict_keys(['alley', 'alwy', 'anx', 'arc', 'ave', 'bayou', 'bch', 'bldg', 'blf', 'blfs', 'blk', 'blvd', 'bnd', 'bnglw', 'bot', 'br', 'brg', 'brgs', 'brk', 'brks', 'bsmt', 'byps', 'cabin', 'circ', 'clf', 'clfs', 'cmn', 'cmns', 'cmp', 'cnyn', 'cor', 'cors', 'cpe', 'cresc', 'crk', 'crossing', 'crossroad', 'cswy', 'ct', 'ctyd', 'ctr', 'curv', 'cv', 'cvs', 'dl', 'dm', 'div', 'dr', 'drs', 'dupl', 'dvwy', 'est', 'ests', 'exp', 'fcty', 'fl', 'fld', 'flds', 'fls', 'flt', 'frd', 'frds', 'frg', 'frgs', 'frk', 'frst', 'fry', 'fwy', 'gdfl', 'gpo', 'grdn', 'grn', 'grns', 'grv', 'grvs', 'gym', 'hbr', 'hbrs', 'hl', 'hls', 'hllw', 'hotl', 'hs', 'hsp', 'hts', 'hvn', 'hwy', 'jct', 'jcts', 'ky', 'kys', 'knl', 'knls', 'lck', 'lcks', 'ldg', 'lgt', 'lk', 'lks', 'ln', 'lp', 'lwr', 'mdw', 'mdws', 'ml', 'mls', 'mnr', 'mnrs', 'msn', 'mt', 'mtn', 'mtns', 'mtwy', 'nck', 'orch', 'ovl', 'ovlk', 'opas', 'pkwy', 'pl', 'plz', 'prk', 'prt', 'psge', 'pt', 'pts', 'pth', 'pthwy', 'rd', 'rds', 'rdg', 'rnch', 'rte', 'rvr', 's

In [4]:
insurace_prefix = [
    "in the following  property",
    "location",
    "property located at",
    "insurace at location",
    "premises",
    "address",
    "location",
    "premises",
    "address",
    "location",
    "premises",
    "address",
    "location",
    "at" ,
    "property",
    "property",
    "property",
    "within the specified property",
    "property situated at",
    "insurance coverage for the location",
    "property residing at",
    "within this property",
    "situated property at",
    "insurance for this location",
    "insurace at",
    "postal address",
    "mailing address",
    "risk_details",
]

insurace_suffix = [
    "your stated interest in this property has been removed",
    "has been removed from this policy along with your name",
    "your interest have been deleted from the policy effective",
    "your stated interest in this property has been removed",
    "has been removed from this policy along with your name",
    "we are therefore removing you",
    "your expressed interest in this property has been revoked",
    "your inclusion in this policy, along with your name, has been omitted",
    "your interests have been expunged from the policy, effective immediately",
    "your specified interest in this property has been eliminated",
]

insurace_suffix_fr = [
    "votre interet designe dans cette propriete a ete retire",
    "votre intérêt déclaré pour cette propriété a été supprimé",
    "votre intérêt déclaré pour cette propriété a été supprimé",
    "votre nom a été retiré de cette police ainsi que votre intérêt",
    "votre intérêt a été supprimé de la police à partir de",
    "votre intérêt déclaré pour cette propriété a été supprimé",
    "votre nom a été retiré de cette police ainsi que votre nom",
    "nous vous retirons donc",
    "votre intérêt exprimé pour cette propriété a été révoqué",
    "votre inclusion dans cette police, ainsi que votre nom, a été omise",
    "vos intérêts ont été effacés de la police, avec effet immédiat",
    "votre intérêt spécifié pour cette propriété a été éliminé"
]

insurace_prefix_fr = [
    "à la propriété suivante",
    "emplacement",
    "propriété située à",
    "assurance à l'emplacement",
    "locaux",
    "adresse",
    "à",
    "au",
    "propriété",
    "au sein de la propriété spécifiée",
    "propriété située à",
    "couverture d'assurance pour l'emplacement",
    "propriété résidant à",
    "au sein de cette propriété",
    "propriété située à",
    "assurance pour cet emplacement",
    "assurance à",
    "adresse postale",
    "adresse postale",
    "détails du risque",
]

### Shared Functions

In [6]:
conf_matrix = pd.read_csv("ocr_confusion_matrix.csv", index_col=0)
conf_matrix_num_to_letter = pd.read_csv("ocr_confusion_matrix_num_to_letter.csv", index_col=0) #confusion matrix that does not map number to letter
assert_almost_equal(np.sum(conf_matrix, axis=1), np.ones(conf_matrix.shape[0]))

def prob(p: float) -> bool:
    return rand.uniform(0, 1) < p

def pc_augment(s : str, letter_aug = 0.85, num_aug = 0.98, keep_p = 1) -> str:  
    return "".join([c if c not in conf_matrix_num_to_letter.columns or (c != '0' and ((c.isdigit() and prob(num_aug)) or (not c.isdigit() and prob(letter_aug)))) else rand.choices(conf_matrix_num_to_letter.columns, weights=conf_matrix_num_to_letter.loc[c], k = 1)[0] for c in s if prob(keep_p)])

def num_augment(s : str, letter_aug = 0.98, num_aug = 0.97, keep_p = 1) -> str:  
    return "".join([c if c not in conf_matrix_num_to_letter.columns or (c.isdigit() and prob(num_aug)) or (not c.isdigit() and prob(letter_aug)) else rand.choices(conf_matrix_num_to_letter.columns, weights=conf_matrix_num_to_letter.loc[c], k = 1)[0] for c in s if prob(keep_p)])

def rm_space_augment(s : str, keep_ws_p = 0.9):
    return "".join([w for w in s if w not in string.whitespace or prob(keep_ws_p)])

def augment(s : str, letter_aug = 0.85, num_aug = 0.98, keep_p = 1) -> str:  
    return "".join([c if c not in conf_matrix.columns or (c.isdigit() and prob(num_aug) or (not c.isdigit() and prob(letter_aug))) else choice(conf_matrix.columns, p=conf_matrix.loc[c]) for c in s if prob(keep_p)])

In [5]:
SUFFIXES = {1: "st", 2: "nd", 3: "rd"}

def rand_ordinal_street():
    num = rand.randint(1, 120)
    suffix = "th" if 10 <= num % 100 <= 20 else SUFFIXES.get(num % 10, "th")
    if prob(0.3):
        return str(num)
    
    return str(num) + suffix

def rand_unit_num():
    options = [f"{rand.randint(1, 120)}", f"{rand.randint(1, 9999)}", f"{rand.randint(1, 9999)}{rand.choice(ascii_lowercase)}"] 
    return choice(options, p=[0.5, 0.35, 0.15])

def rand_rr():
    rroptions = [f"rr {rand.randint(1, 120)}", f"rr{rand.randint(1, 120)}"]
    return rand.choice(rroptions)

def insert_spaces(s, p):
    s = list(s)
    for i in range(len(s)-1):
        if prob(p):
            s[i] = s[i] + ' '
    return ''.join(s)

# def rand_gibberish_address(is_en = True):
#     street = fake_en.street_address() if is_en else fake_fr.street_address()
#     street_sec = fake_en.secondary_address() if is_en else fake_fr.secondary_address()
#     base_addr = rand.choice([street, street_sec, street+street_sec, street_sec+street])
#     gibberish_addr = augment(base_addr, letter_aug=0, num_aug=0, keep_p=0.85).lower()
#     p = 0.85 
#     if len(gibberish_addr.replace(" ", "")) < 20: p = 0.5
#     if len(gibberish_addr.replace(" ", "")) < 10: p = 0.3
#     gibberish_addr =  "".join([w if prob(p) else rand.choice(ascii_lowercase + digits + "-") for w in gibberish_addr])
#     return " ".join([w for w in gibberish_addr.split() if not w.isdigit() and len(w) > 3])


def get_insertion_index(tags, exclude=None):
    insertable_indices = [0, len(tags)]
    for i in range(1, len(tags)):
        if tags[i][:5] != tags[i - 1][:5]:
            insertable_indices.append(i)

    if exclude != None and exclude in insertable_indices:
        insertable_indices.remove(exclude)
    
    assert len(insertable_indices) > 0
    return rand.choice(insertable_indices)

def aug_words_tags(words, tags, p):
    words_new, tags_new = [], []
    for i in range(len(words)):
        augword = words[i]

        if (tags[i] == "postal_code"):
            augword = pc_augment(augword, letter_aug= 0.9, num_aug= 0.8)
        elif (tags[i] != "unit" and tags[i]!="pobox" and tags[i]!="street_no" and prob(p)):
            augword = augment(augword)
        elif (tags[i] != "" and tags[i]!="pobox" and tags[i]!="street_no" and prob(p)):

        elif (tags[i] == "unit" or tags[i] =="pobox" or tags[i]=="street_no" and prob(0.03)):
            og = augword                
            augword = num_augment(augword, letter_aug=0.95, num_aug=0.95)
            # if tags[i] == "street_no" and og != augword:
            #     print(og, augword, sep="   ")
        if (tags[i] != "unit" and tags[i]!="pobox" and tags[i]!="street_no" and prob(0.02)):
            augword = aug_split.augment(augword)[0]
        
        tags_new.extend([tags[i]] * len(augword.split()))
        words_new.extend(augword.split())
            
    return words_new, tags_new

IndentationError: expected an indented block after 'elif' statement on line 59 (1165887534.py, line 61)

In [None]:
def convert(wt, tt, is_en = True):
    # perform rearrangments
    r = rand.uniform(0,1)
    if r < 0.1:
        str_wt, str_tt = [], []
        nstr_wt, nstr_tt = [], []
        for i in range(len(wt)):
            if tt[i][:3] == "str" or tt[i] == "unit" or tt[i] == "pobox":
                str_wt.append(wt[i])
                str_tt.append(tt[i])
            else:
                nstr_wt.append(wt[i])
                nstr_tt.append(tt[i])
        msk = list(range(len(nstr_wt)))
        rand.shuffle(msk)
        wt = [nstr_wt[i] for i in msk]
        tt = [nstr_tt[i] for i in msk]
        iind = get_insertion_index(wt,tt)
        wt,tt = wt[:iind] + str_wt + wt[iind:], tt[:iind] + str_tt + tt[iind:]
    elif r < 0.65:
        swappable_inds = [i for i in range(len(wt)) if not (tt[i][:3] == "str" or tt[i] == "unit" or tt[i] == "pobox")]
        if len(swappable_inds) > 2:
            i, j = rand.sample(swappable_inds, 2)
            wt[i], wt[j] = wt[j], wt[i]
            tt[i], tt[j] = tt[j], tt[i]
    elif r < 0.8 and "postal_code" in tt:
        i = tt.index("postal_code")
        swappable_inds = [i for i in range(len(wt)) if tt[i][:3] != "str"]
        if len(swappable_inds) > 1:
            j = rand.choice(swappable_inds)
            wt[i], wt[j] = wt[j], wt[i]
            tt[i], tt[j] = tt[j], tt[i]

    std_words, nstd_words = [], []
    pot_dup_words, pot_dup_tags = [], []
    nstsno = None
    
    keep_street = prob(0.99)

    for i in range(len(wt)):
        assert isinstance(wt[i], str) == True
        if tt[i] == "city":
            std_words.append(wt[i])
            nstd_words.append(wt[i])
            if prob(0.8):
                pot_dup_words.append(wt[i])
                pot_dup_tags.append(tt[i])
        elif tt[i] == "postal_code":
            pc = wt[i].replace(" ", "")
            if len(pc) < 6 or prob(0.2):
                continue
            else:
                if wt[i][3] != " " and len(wt[i])==6:
                    wt[i] = wt[i][:3] + " " + wt[i][3:]
                    print(wt[i])
                if prob(0.02):
                    wt[i] = pc[:3]
                elif prob(0.02):
                    wt[i] = pc[3:]
                std_words.append(wt[i])
                nstd_words.append(wt[i])
                temp_snapshot = deepcopy(std_words), deepcopy(nstd_words)

                if prob(0.8):
                    pot_dup_words.append(wt[i])
                    pot_dup_tags.append(tt[i])
        elif tt[i] == "province":
            nst = wt[i] if wt[i] not in to_non_stand_prov or prob(0.6) else rand.choice(to_non_stand_prov[wt[i]])
            std_words.append(wt[i])
            nstd_words.append(nst)
            if prob(0.6):
                pot_dup_words.append(nst)
                pot_dup_tags.append(tt[i])
        elif tt[i] == "country":
            assert wt[i] == "ca", f"{wt[i]} {wt} {tt}"
            nst = wt[i] if prob(0.7) else "canada"
            std_words.append(wt[i])
            nstd_words.append(nst)
            if prob(0.5):
                pot_dup_words.append(nst)
                pot_dup_tags.append(tt[i])
        elif tt[i] == "rr":
            std_words.append(wt[i])
            nst = wt[i].replace("rr", rand.choice(to_non_stand_num_term["rr"])) if prob(0.3) else wt[i]
            nstd_words.append(nst)
        elif tt[i] == "pobox":
                std_words.append(f"po box {wt[i]}")
                unit_sep = rand.choice(["#", "# ", " #", " # ", "", " ", " ", " "])
                nstd_words.append(rand.choice([f"po box{unit_sep}{wt[i]} ", f"{rand.choice(to_non_stand_num_term["po box"])}{unit_sep}{wt[i]}"]))
        
        if keep_street:
            if tt[i] == "unit":
                std_words.append(f"unit {wt[i]}")
                if "street_no" in tt and prob(0.2):
                    ind = tt.index("street_no")
                    nstsno = f"{wt[i]}-{wt[ind]}"
                else:
                    unit_sep = rand.choice(["#", "# ", " #", " # ", "", " ", " ", " ", " ", " ", " ", " ", " "])
                    nstd_words.append(choice([f"unit{unit_sep}{wt[i]}", f"{unit_sep}{wt[i]}".strip(), f"{rand.choice(to_non_stand_num_term["unit"])}{unit_sep}{wt[i]}"], p=[0.2, 0.5, 0.3]))
            elif tt[i] == "ph":
                std_words.append(f"ph {wt[i]}")
                unit_sep = rand.choice(["#", "# ", " #", " # ", "", " ", " ", " "])
                nstd_words.append(rand.choice([f"ph {unit_sep}{wt[i]} ", f"{rand.choice(to_non_stand_num_term["ph"])}{unit_sep}{wt[i]}"]))
            elif tt[i] == "street_no":
                std_words.append(wt[i])
                nstd_words.append(wt[i] if nstsno == None else nstsno)
                if prob(0.3):
                    pot_dup_words.append(wt[i])
                    pot_dup_tags.append(tt[i])
            elif tt[i] == "str_name":
                sst = " ".join([w if w not in to_std else to_std[w] for w in wt[i].split()])
                nst = " ".join([w if w not in to_non_std or prob(0.7) else rand.choice(to_non_std[w]) for w in sst.split()])
                std_words.append(sst)
                nstd_words.append(nst)
                pot_dup_words.append(nst)
                pot_dup_tags.append(tt[i])
            elif tt[i] == "str_type"and prob(0.95):
                sst = ""
                if wt[i] in to_stand_street_type:
                    sst = to_stand_street_type[wt[i]]
                else:
                    sst = " ".join([w if w not in to_stand_street_type else to_stand_street_type[w] for w in wt[i].split()])

                nst = " ".join([w if w not in to_non_stand_street_type or prob(0.3) else rand.choice(to_non_stand_street_type[w]) for w in sst.split()])
                std_words.append(sst)
                nstd_words.append(nst)
                pot_dup_words.append(nst)
                pot_dup_tags.append(tt[i])
            elif tt[i] == "str_dir" and prob(0.95):
                sst = wt[i] if wt[i] not in to_stand_street_dir else to_stand_street_dir[wt[i]]
                nst = sst if sst not in to_non_stand_street_dir or prob(0.3) else rand.choice(to_non_stand_street_dir[sst])
                std_words.append(sst)
                nstd_words.append(nst)
                if prob(0.9):
                    pot_dup_words.append(nst)
                    pot_dup_tags.append(tt[i])

        # else:
        #     std_words.append(wt[i])
        #     nstd_words.append(wt[i])

    # if prob(0.03):
    #     nstd_words.extend(pot_dup_words)
    #     tt.extend(pot_dup_tags)
    

    # final augmentation, non standard only 
                    
    tags = tt
    if nstsno != None:
        tt.remove("unit")
    if prob(0.75): 
        nstd_words, tags = aug_words_tags(nstd_words, tt, 0.3)

    nstd_res = " ".join(nstd_words)

    # if prob(0.03):
    #     iind = get_insertion_index(nstd_words,tags)
    #     nstd_words,tags = wt[:iind] + [rand_gibberish_address()] + wt[iind:], tt[:iind] + ["gibberish"] + tt[iind:]
    #     nstd_res = " ".join(nstd_words)
    # elif prob(0.5):

    if prob(0.4):
        nstd_res = rm_space_augment(nstd_res, keep_ws_p=0.8)

    if prob(0.02) and is_en:
        sep = [" ", ""]
        if prob(0.7):
            nstd_res = nstd_res + choice(sep, p=[0.9, 0.1]) + rm_space_augment(augment(rand.choice(insurace_suffix), letter_aug=0.8))
        if prob(0.7):
            nstd_res = rm_space_augment(augment(rand.choice(insurace_prefix), letter_aug=0.8)) + choice(sep, p=[0.9, 0.1]) + nstd_res  

    if prob(0.01) and not is_en:
        sep = [" ", ""]
        if prob(0.7):
            nstd_res = nstd_res + choice(sep, p=[0.9, 0.1]) + rm_space_augment(augment(rand.choice(insurace_suffix_fr), letter_aug=0.8))
        if prob(0.7):
            nstd_res = rm_space_augment(augment(rand.choice(insurace_prefix_fr), letter_aug=0.8)) + choice(sep, p=[0.9, 0.1]) + nstd_res  

    std_res = " ".join(std_words)
    return std_res, nstd_res

In [None]:
for i in range(10000):
    a,b = convert(["egmont", "8096", "watkins","landing", "nw", "v0n 1n0", "ca" ,"bc"], ["city", "street_no", "str_name", "str_type", "str_dir", "postal_code", "country", "province"])
    if "v0n" not in a and "v0n" in "b":
        print(a,b)
    if "von" not in a and "v0n" in "b":
        print(a,b)


### Importing Data

In [9]:
statcan_data = []
statcan_relevant_cols = ["street_no", "street", "city", "postal_code", "Province"]
dtypes = {key: str for key in statcan_relevant_cols}
for prov in province_lst: 
    df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
    df["province"] = prov
    df = df[(df["street"] != "UNASSIGNED")& (df["postal_code"] != "UNASSIGNED")& (df["street_no"].str.contains("-") == False)]
    df = df.dropna(subset  = ["str_name"])
    # df = df.drop(df[df["postal_code"].isnull()].sample(frac=0.5).index)
    statcan_data.append(df)

statcan_df = pd.concat(statcan_data, ignore_index=True)
statcan_df["non-std-address"] = ""
statcan_df["std-address"] = ""
statcan_df.head()

cpcdf = pd.read_csv("data\CanadianPostalCodes202312.csv")
cpcdf = cpcdf[["POSTAL_CODE", "CITY", "PROVINCE_ABBR"]]
cpcdf.rename(columns={"POSTAL_CODE": "postal_code", "CITY": "city", "PROVINCE_ABBR": "province"}, inplace=True)
cpcdf["street_no"] = ""
cpcdf["str_name"] = ""
cpcdf["str_type"] = ""
cpcdf["non-std-address"] = ""
cpcdf["std-address"] = ""

city_df = pd.read_csv("data\cgn_canada_csv_eng.csv")
city_df = city_df[(city_df['Generic Category']== "Populated Place") & (city_df['Language']== "Undetermined")].drop_duplicates(subset=['Geographical Name'])
city_df['std-address'] = city_df['Geographical Name'].apply(lambda city : city.lower())
city_df['non-std-address']=city_df['std-address'].apply(lambda city : augment(city.lower(), letter_aug=0.5) if prob(0.9) else aug_split.augment(augment(city.lower(), letter_aug=0.5))[0])
city_df = city_df[['non-std-address', 'std-address']]

  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  cpcdf = pd.read_csv("data\CanadianPostalCodes202312.csv")
  city_df = pd.read_csv("data\cgn_canada_csv_eng.csv")
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"])
  df = pd.read_csv(f"data\ODA_{prov.capitalize()}_v1.csv", dtype=dtypes).drop_duplicates(subset=["full_addr"

In [10]:
len(statcan_df)

5582851

### Augmenting Data

In [None]:
for i, row in statcan_df.iterrows():
    wt, tt = [], []
    illegal_pc = ["VACANT", "QUEENS", "RMC", "na", "UNKNOWN", "UNASSIGNED"]
    
    if prob(0.3):
        wt.append(rand_unit_num())
        tt.append("unit")
    elif prob(0.2):
        wt.append(choice([f"{rand.randint(1,9999)}", f"{rand.randint(1,99999)}"], p= [0.8, 0.2]))
        tt.append("pobox")
    elif prob(0.02):
        wt.append(rand_unit_num())
        tt.append("ph")

    row["rr"] = rand_rr() if prob(0.001) else ""
    row["postal_code"] = row["postal_code"] if row["postal_code"] not in illegal_pc else ""

    for tag in ["street_no", "str_name", "str_type", "str_dir", "rr", "city", "postal_code", "province"]:
        if isinstance(row[tag], str) and not row[tag].isspace() and not row[tag] == "":
            tt.append(tag)
            wt.append(row[tag].lower())

    if prob(0.1):
        wt.append("ca")
        tt.append("country")

    std_adr, nstd_adr = convert(wt, tt, is_en=row["province"].lower() != "qc")
    statcan_df.at[i, "non-std-address"] = nstd_adr
    statcan_df.at[i, "std-address"] = std_adr

statcan_df = statcan_df[cols]
statcan_df.head()

In [None]:
single_pc_std = []
single_pc_nstd = []

for i, row in cpcdf.iterrows():
    pc = row["postal_code"].replace("-", " ").lower()
    if prob(0.01) and len(pc) == 7:
        single_pc_std.append(pc)
        pc = pc_augment(pc, num_aug= 0.5).split()
        pc_word = choice([f"{pc[0]}{pc[1]}", f"{pc[0]} {pc[1]}", f"{pc[1]}{pc[0]}"], p=[0.3, 0.68, 0.02])
        if (len(pc_word.split())) == 2 and prob(0.1):
            pc_word = aug_split.augment(pc_word)[0]
        single_pc_nstd.append(pc_word.lower())

    wt, tt = [], []
    
    if row["province"].lower() != "qc":
        row["str_name"], row["str_type"] = fake_en.street_name().lower().split() if prob(0.995) else (rand_ordinal_street(), rand.choice(street_type))
    if row["province"].lower() == "qc":
        row["str_name"], row["str_type"] = fake_fr.street_name().lower().split() if prob(0.995) else (rand_ordinal_street(), rand.choice(street_type))
    
    row["street_no"] = f"{rand.randint(1,9999)}" if prob(0.95) else ""
    row["str_dir"] = rand.choice(street_dir) if prob(0.1) else ""
    row["rr"] = rand_rr() if prob(0.001) else ""

    if prob(0.2):
        wt.append(rand_unit_num())
        tt.append("unit")
    elif prob(0.2):
        wt.append(choice([f"{rand.randint(1,9999)}", f"{rand.randint(1,99999)}"], p= [0.8, 0.2]))
        tt.append("pobox")
    elif prob(0.02):
        wt.append(rand_unit_num())
        tt.append("ph")
    
    rel_tags = ["street_no", "str_name", "str_type", "str_dir", "rr", "city", "postal_code", "province"]
    if row["province"].lower() == "qc" and prob(0.3):
        rel_tags = ["street_no", "str_type", "str_name", "rr", "city", "postal_code", "province"]
    for tag in rel_tags:
        if isinstance(row[tag], str) and not row[tag].isspace() and not row[tag] == "":
            tt.append(tag)
            wt.append(row[tag].lower())

    if prob(0.1):
        wt.append("ca")
        tt.append("country")

    std_adr, nstd_adr = convert(wt, tt, is_en=row["province"].lower() != "qc")
    cpcdf.at[i, "non-std-address"] = nstd_adr
    cpcdf.at[i, "std-address"] = std_adr

cpcdf = cpcdf[cols]
pc_df = pd.DataFrame({"non-std-address": single_pc_nstd, "std-address": single_pc_std})
cpcdf.head()

In [None]:
import copy

pc_city_map = {
    "P0B 1A0": "Baysville",
    "P0B 1E0": "Milford Bay",
    "P0B 1G0": "Minett",
    "P0B 1J0": "Port Carling",
    "P0B 1K0": "Port Sandfield",
    "P0B 1L0": "Port Sydney",
    "P0B 1M0": "Utterson",
    "P0B 1P0": "Windermere",
    "P0H 1A0": "Arnstein",
    "P0H 1B0": "Astorville",
    "P0H 1C0": "Bear Island",
    "P0H 1E0": "Bonfield",
    "P0H 1G0": "Cache Bay",
    "P0H 1H0": "Callander",
    "P0H 1J0": "Commanda",
    "P0H 1K0": "Corbeil",
    "P0H 1L0": "Crystal Falls",
    "P0H 1M0": "Field",
    "P0H 1N0": "Golden Valley",
    "P0H 1P0": "Hornell Heights",
    "P0H 1R0": "Lavigne",
    "P0H 1S0": "Loring",
    "P0H 1T0": "Marten River",
    "P0H 1V0": "Mattawa",
    "P0H 1W0": "Nipissing",
    "P0H 1Y0": "Port Loring",
    "P0H 1Z0": "Powassan",
    "P0H 2A0": "Redbridge",
    "P0H 2C0": "River Valley",
    "P0H 2E0": "Rutherglen",
    "P0H 2H0": "Temagami",
    "P0H 2J0": "Thorne",
    "P0H 2K0": "Tilden Lake",
    "P0H 2L0": "Trout Creek",
    "P0H 2M0": "Verner",
    "P0H 2N0": "Warren",
    "P0H 2R0": "Restoule",
    "V0N 1A0":" Alert Bay",
    "V0N 1B0":" Whistler",
    "V0N 1E0":" Blubber Bay",
    "V0N 1G0":" Bowen Island",
    "V0N 1H0":" Brackendale",
    "V0N 1J0":" Britannia Beach",
    "V0N 1K0":" Coal Harbour",
    "V0N 1L0":" D'Arcy",
    "V0N 1M0":" Dawsons Landing",
    "V0N 1N0":" Egmont",
    "V0N 1P0":" Galiano Island",
    "V0N 1S0":" Garden Bay",
    "V0N 1T0":" Garibaldi Highlands",
    "V0N 1V0":" Gibsons",
    "V0N 1W0":" Gillies Bay",
    "V0N 1Z0":" Holberg",
    "V0N 2B0":" Kingcome Inlet",
    "V0N 2E0":" Lions Bay",
    "V0N 2G0":" Lund",
    "V0N 2H0":" Madeira Park",
    "V0N 2H1":" Pender Harbour",
    "V0N 2J0":" Mayne Island",
    "V0N 2K0":" Mount Currie",
    "V0N 2L0":" Pemberton",
    "V0N 2M0":" Pender Island",
    "V0N 2N0":" Port Alice",
    "V0N 2P0":" Port Hardy",
    "V0N 2R0":" Port McNeill",
    "V0N 2S0":" Port Mellon",
    "V0N 2V0":" Quatsino",
    "V0N 2W0":" Roberts Creek",
    "V0N 2Y0":" Saturna",
    "V0N 3A0":" Sechelt",
    "V0N 3B0":" Seton Portage",
    "V0N 3C0":" Shalalth",
    "V0N 3E0":" Sointula",
    "V0N 3J0":" Telegraph Cove",
    "V0N 3K0":" Van Anda",
    "V0N 3L0":" Winter Harbour",
    "V0N 3P0":" Woss",
    "V0N 3V0":" Gibsons",
    "V0N 3Z0":" Furry Creek",
    "V0R 1A0":" Ahousat",
    "V0R 1B0":" Bamfield",
    "V0R 1G0":" Bowser",
    "V0R 1H0":" Cassidy",
    "V0R 1K0":" Chemainus",
    "V0R 1L0":" Cobble Hill",
    "V0R 1M0":" Coombs",
    "V0R 1N0":" Cowichan Bay",
    "V0R 1R0":" Crofton",
    "V0R 1S0":" Cumberland",
    "V0R 1T0":" Denman Island",
    "V0R 1V0":" Errington",
    "V0R 1W0":" Fanny Bay",
    "V0R 1X0":" Gabriola",
    "V0R 1Y0":" Honeymoon Bay",
    "V0R 1Z0":" Hornby Island",
    "V0R 2B0":" Kildonan",
    "V0R 2C0":" Koksilah",
    "V0R 2G0":" Lake Cowichan",
    "V0R 2H0":" Lantzville",
    "V0R 2J0":" Lasqueti",
    "V0R 2K0":" Lazo",
    "V0R 2L0":" Malahat",
    "V0R 2M0":" Merville",
    "V0R 2N0":" Mesachie Lake",
    "V0R 2P0":" Mill Bay",
    "V0R 2V0":" Royston",
    "V0R 2W0":" Shawnigan Lake",
    "V0R 2Y0":" Thetis Island",
    "V0R 2Z0":" Tofino",
    "V0R 3A0":" Ucluelet",
    "V0R 3B0":" Union Bay",
    "V0R 3C0":" Westholme",
    "V0R 3E1":" Youbou",
    "V0R 4K0":" Chemainus",
    "V0R 5K0":" Penelakut Island",
    "L0N 1A0":" Alton",
"L0N 1B0":" Belfountain",
"L0N 1C0":" Caledon Village",
"L0N 1E0":" Caledon East",
"L0N 1G0":" Grand Valley",
"L0N 1H0":" Honeywood",
"L0N 1J0":" Horning's Mills",
"L0N 1K0":" Inglewood",
"L0N 1L0":" Laurel",
"L0N 1M0":" Mansfield",
"L0N 1N0":" Orton",
"L0N 1P0":" Palgrave",
"L0N 1R0":" Rosemont",
"L0N 1S0":" Shelburne",
"N0N 1A0":" Alvinston",
"N0N 1B0":" Brigden",
"N0N 1C0":" BrightΓÇÖs Grove",
"N0N 1E0":" Camlachie",
"N0N 1G0":" Corunna",
"N0N 1H0":" Courtright",
"N0N 1J0":" Forest",
"N0N 1K0":" Inwood",
"N0N 1M0":" Mooretown",
"N0N 1N0":" Oil City",
"N0N 1P0":" Oil Springs",
"N0N 1R0":" Petrolia",
"N0N 1T0":" Wyoming",
"A0N 1A0":" Aguathuna",
"A0N 1B0":" Barachois Brook",
"A0N 1C0":" Cape Ray",
"A0N 1E0":" Cape St. George",
"A0N 1G0":" Cartyville",
"A0N 1H0":" Codroy",
"A0N 1J0":" Doyles",
"A0N 1K0":" Grand Bay East",
"A0N 1M0":" Heatherton",
"A0N 1N0":" Highlands",
"A0N 1P0":" Jeffrey's",
"A0N 1R0":" Lourdes",
"A0N 1S0":" Noels Pond",
"A0N 1T0":" Port au Port",
"A0N 1V0":" Robinsons",
"A0N 1W0":" St. Andrew's",
"A0N 1X0":" St. David's",
"A0N 1Y0":" St. Fintan's",
"A0N 1Z0":" St. George's",
"A0N 2B0":" South Branch",
"A0N 2C0":" Stephenville Crossing",
"A0N 2E0":" West Bay Centre",
"A0N 2G0":" Black Duck Siding",
"A0N 2H0":" Burgeo",
"A0N 2J0":" Ramea",
"A0N 2K0":" Fran├ºois",
"A0N 2L0":" Grey River",
"B0N 1C0":" Brookfield",
"B0N 1E0":" Centre Burlington",
"B0N 1G0":" Cheverie",
"B0N 1H0":" Curry's Corner",
"B0N 1J0":" Densmore Mills",
"B0N 1K0":" Elderbank",
"B0N 1L0":" Ellershouse",
"B0N 1P0":" Kennetcook",
"B0N 1T0":" Maitland",
"B0N 1V0":" Meaghers Grant",
"B0N 1W0":" Micmac",
"B0N 1X0":" Middle Musquodoboit",
"B0N 1Y0":" Milford Station, Milford",
"B0N 1Z0":" Mount Uniacke",
"B0N 2A0":" Newport",
"B0N 2B0":" Newport Station",
"B0N 2C0":" Noel",
"B0N 2E0":" Ste-Croix",
"B0N 2G0":" Scotch Village",
"B0N 2H0":" Shubenacadie",
"B0N 2J0":" Stewiacke",
"B0N 2K0":" Summerville",
"B0N 2L0":" Upper Kennetcook",
"B0N 2M0":" Upper Musquodoboit",
"B0N 2N0":" Upper Rawdon",
"B0N 2P0":" Upper Stewiacke",
"B0N 2R0":" Walton",
"B0N 2T0":" Windsor",
"B0N 3A0":" Ellershouse",
"G0N 1B0":" Saint-Joseph-de-Coleraine, Saint-Julien",
"G0N 1C0":" Sainte-Clotilde-de-Beauce",
"G0N 1E0":" Disraeli",
"G0N 1E1":" Sainte-Prax├¿de",
"G0N 1E2":" Saint-Jacques-le-Majeur-de-Wolfestown",
"G0N 1G0":" Sacr├⌐-C┼ôur-de-J├⌐sus",
"G0N 1H0":" East Broughton",
"G0N 1J0":" Saint-Jacques-de-Leeds",
"G0N 1K0":" Kinnear's Mills",
"G0N 1M0":" Saint-Adrien-d'Irlande",
"G0N 1N0":" Saint-Ferdinand",
"G0N 1P0":" Saint-Fr├⌐d├⌐ric",
"G0N 1R0":" Saint-Jules",
"G0N 1S0":" Adstock",
"G0N 1T0":" Saint-Pierre-de-Broughton",
"G0N 1V0":" Saint-S├⌐verin",
"G0N 1X0":" Tring-Jonction",
"S0N 0A0":" Abbey",
"S0N 0B0":" Admiral",
"S0N 0C0":" Aneroid",
"S0N 0E0":" Blumenhof",
"S0N 0G0":" Bracken",
"S0N 0H0":" Burstall",
"S0N 0J0":" Cabri",
"S0N 0K0":" Cadillac",
"S0N 0M0":" Claydon",
"S0N 0N0":" Climax",
"S0N 0P0":" Consul",
"S0N 0S0":" Dollard",
"S0N 0T0":" Eastend",
"S0N 0V0":" Fox Valley",
"S0N 0W0":" Frontier",
"S0N 0X0":" Glenbain",
"S0N 0Y0":" Golden Prairie",
"S0N 1A0":" Gull Lake",
"S0N 1C0":" Hazenmore",
"S0N 1E0":" Hazlet",
"S0N 1G0":" Lancer",
"S0N 1H0":" Leader",
"S0N 1L0":" Liebenthal",
"S0N 1M0":" McMahon",
"S0N 1N0":" Maple Creek",
"S0N 1P0":" Mendham",
"S0N 1S0":" Neidpath",
"S0N 1T0":" Neville",
"S0N 1V0":" Orkney",
"S0N 1W0":" Pambrun",
"S0N 1X0":" Pennant Station",
"S0N 1Y0":" Piapot",
"S0N 1Z0":" Ponteix",
"S0N 2A0":" Portreeve",
"S0N 2B0":" Prelate",
"S0N 2E0":" Richmound",
"S0N 2G0":" Robsart",
"S0N 2H0":" Sceptre",
"S0N 2L0":" Shackleton",
"S0N 2M0":" Shaunavon",
"S0N 2N0":" Simmie",
"S0N 2P0":" Stewart Valley",
"S0N 2R0":" Success",
"S0N 2S0":" Tompkins",
"S0N 2T0":" Val Marie",
"S0N 2V0":" Vanguard",
"S0N 2W0":" Vidora",
"S0N 2X0":" Webb",
"S0N 2Y0":" Wymark",
}

pc_to_prov = {
    "a" : "nl",
    "b" : "ns",
    "g" : "qc",
    "h" : "qc",
    "j" : "qc",
    "k" : "on",
    "l" : "on",
    "m" : "on",
    "n" : "on",
    "p" : "on",
    "r" : "mb",
    "s" : "sk",
    "t" : "ab",
    "v" : "bc"
}

std = []
nstd = []

for k in pc_city_map.keys():
    pc = k.lower()
    for i in range(1000):
        city =  pc_city_map[k].lower()

        wt,tt = [], []
        
        if prob(0.1):
            wt.append(rand_unit_num())
            tt.append("unit")
        elif prob(0.5):
            wt.append(f"{rand.randint(1,9999)}")
            tt.append("pobox")

        wt.append(f"{rand.randint(1,9999)}")
        tt.append("street_no")

        if prob(0.995):
            sn, st = fake_en.street_name().lower().split()
            wt.append(sn)
            tt.append("str_name")
            wt.append(st)
            tt.append("str_type")
        else:
            wt.append(rand_ordinal_street())
            tt.append("str_name")
            wt.append(rand.choice(street_type))
            tt.append("str_type")

        if prob(0.1):
            wt.append(rand.choice(street_dir))
            tt.append("str_dir")

        wt.append(city)
        tt.append("city")

        wt.append(pc)
        tt.append("postal_code")

        wt.append(pc_to_prov[pc[0]])
        tt.append("province")
        
        if prob(0.4):
            wt.append("ca")
            tt.append("country")
        
        tmp_wt = copy.deepcopy(wt)
        tmp_tt = copy.deepcopy(tt)
        std_adr, nstd_adr = convert(wt, tt, is_en=pc_to_prov[pc[0]] != "qc")
                                
        std.append(std_adr)
        nstd.append(nstd_adr)

popc_df = pd.DataFrame({"non-std-address": nstd, "std-address": std})

In [None]:
statcan_df=statcan_df.reindex(columns=["non-std-address", "std-address"])
cpcdf=cpcdf.reindex(columns=["non-std-address", "std-address"])
all_df = pd.concat([statcan_df, cpcdf, pc_df, popc_df, city_df], ignore_index=True)
all_df = all_df[all_df['non-std-address'].str.len() > 0]
all_df.to_csv("addr_std_training_data_11.csv", index=False)

In [None]:
len(popc_df)

In [None]:
len(statcan_df) + len(cpcdf) + len(pc_df) + len(popc_df)

In [None]:
city_df.head()

In [None]:
for i in range(20):
    print(pc_augment("l0r 01a", letter_aug = 0.7, num_aug = 0.5))
    # print(choice(conf_matrix.columns, p=conf_matrix.loc['0']))

In [None]:
lst = [1,23]
lst_2 = lst
lst.append(4)
print(lst_2)