### Install the required modules and import them

In [None]:
%pip install translitcodec
#%pip install codecs
%pip install pandas
%pip install numpy
%pip install number-parser

In [3]:
import translitcodec
import codecs
from number_parser import parse
import pandas as pd
import numpy as np
import re


### Prepare a standardization function
- Trim spaces
- Transliterate to ASCII/Latin
- Convert to Lowercase

In [None]:
#---------------------------------------------------------------------------------------------------
def Standardize_Text(x):
    """Standardize text by trimming, transliterating, and converting to lowercase"""
    return codecs.encode(x.strip(),"translit/short").lower()
#---------------------------------------------------------------------------------------------------

### Normalize Countries
- Strip spaces, periods, parentheses
- Map to a country dictionary

In [None]:
#---------------------------------------------------------------------------------------------------
country_canon = {"afghanistan":"af","albania":"al","algeria":"dz","americansamoa":"as","andorra":"ad","angola":"ao","anguilla":"AI",
    "antarctica":"AQ","antigua":"AG","barbuda":"AG","antiguaandbarbuda":"AG","argentina":"AR","armenia":"AM","aruba":"AW",
    "australia":"AU","austria":"AT","azerbaijan":"AZ","bahamas":"BS","bahrain":"BH","bangladesh":"BD","barbados":"BB","belarus":"BY",
    "belgium":"BE","belize":"BZ","benin":"BJ","bermuda":"BM","bhutan":"BT","bolivia":"BO","boliviaplurinationalstateof":"BO",
    "bonaire":"BQ","sinteustatius":"BQ","saba":"BQ","bonaire, sinteustatiusandsaba":"BQ","bosnia":"BA","bosniaandherzegovina":"BA",
    "herzegovina":"BA","botswana":"BW","bouvetisland":"BV","brazil":"BR","britishindianoceanterritory":"IO","brunei":"BN",
    "bruneidarussalam":"BN","bulgaria":"BG","burkinafaso":"BF","burundi":"BI","cambodia":"KH","cameroon":"CM","canada":"CA",
    "capeverde":"CV","caymanislands":"KY","centralafricanrepublic":"CF","chad":"TD","chile":"CL","china":"CN","christmasisland":"CX",
    "cocosislands":"CC","keelingislands":"CC","cocoskeelingislands":"CC","colombia":"CO","comoros":"KM","congo":"CG",
    "congo, the democratic republic of the":"CD","cook islands":"CK","costa rica":"CR","cotedivoire":"CI","ivorycoast":"CI","croatia":"HR",
    "cuba":"CU","curacao":"CW","cyprus":"CY","czechrepublic":"CZ","denmark":"DK","djibouti":"DJ","dominica":"DM",
    "dominicanrepublic":"DO","ecuador":"EC","egypt":"EG","elalvador":"SV","equatorialguinea":"GQ","eritrea":"ER","estonia":"EE",
    "ethiopia":"ET","falklandislands":"FK","malvinas":"FK","falklandislandsmalvinas":"FK","faroeislands":"FO","fiji":"FJ",
    "finland":"FI","france":"FR","frenchguiana":"GF","frenchpolynesia":"PF","frenchsouthernterritories":"TF","gabon":"GA",
    "gambia":"GM","georgia":"GE","germany":"DE","ghana":"GH","gibraltar":"GI","greece":"GR","greenland":"GL","grenada":"GD",
    "guadeloupe":"GP","guam":"GU","guatemala":"GT","guernsey":"GG","guinea":"GN","guineabissau":"GW","guyana":"GY",
    "haiti":"HT","heardisland":"HM","mcdonaldislands":"HM","heardislandandmcdonaldislands":"HM","holysee":"VA",
    "vaticancitystate":"VA","vaticancity":"VA","holyseevaticancitystate":"VA","honduras":"HN","hongkong":"HK","hungary":"HU",
    "iceland":"IS","india":"IN","indonesia":"ID","iran":"IR","iranislamicrepublicof":"IR","iraq":"IQ","ireland":"IE","isleofman":"IM",
    "israel":"IL","italy":"IT","jamaica":"JM","japan":"JP","jersey":"JE","jordan":"JO","kazakhstan":"KZ","kenya":"KE","kiribati":"KI",
    "nkorea":"KP","northkorea":"KP","koreademocraticpeoplesrepublicof":"KP","korea":"KR","skorea":"KR","southkorea":"KR",
    "korearepublicof":"KR","kuwait":"KW","kyrgyzstan":"KG","lao":"LA","laopeoplesdemocraticrepublic":"LA","latvia":"LV",
    "lebanon":"LB","lesotho":"LS","liberia":"LR","libya":"LY","liechtenstein":"LI","lithuania":"LT","luxembourg":"LU","macau":"MO",
    "macao":"MO","macedonia":"MK","macedoniarepublicof":"MK","madagascar":"MG","malawi":"MW","malaysia":"MY","maldives":"MV",
    "mali":"ML","malta":"MT","marshallislands":"MH","martinique":"MQ","mauritania":"MR","mauritius":"MU","mayotte":"YT","mexico":"MX",
    "micronesia":"FM","micronesiafederatedstatesof":"FM","moldova":"MD","moldovarepublicof":"MD","monaco":"MC","mongolia":"MN",
    "montenegro":"ME","montserrat":"MS","morocco":"MA","mozambique":"MZ","myanmar":"MM","namibia":"NA","nauru":"NR","nepal":"NP",
    "nederlands":"NL","thenetherlands":"NL","holland":"NL","netherlands":"NL","newcaledonia":"NC","newzealand":"NZ","nicaragua":"NI",
    "niger":"NE","nigeria":"NG","niue":"NU","norfolkisland":"NF","nmarianaislands":"MP","northernmarianaislands":"MP","norway":"NO",
    "oman":"OM","pakistan":"PK","palau":"PW","palestine":"PS","palestinianterritory":"PS","palestinianterritoryoccupied":"PS",
    "panama":"PA","papuanewguinea":"PG","paraguay":"PY","peru":"PE","philippines":"PH","pitcairn":"PN","poland":"PL","portugal":"PT",
    "puertorico":"PR","qatar":"QA","reunion":"RE","romania":"RO","russia":"RU","russianfederation":"RU","rwanda":"RW",
    "stbarthelemy":"BL","saintbarthelemy":"BL","sainthelena":"SH","sthelena":"SH","ascension":"SH","tristandacunha":"SH",
    "sainthelenaascensionandtristandacunha":"SH","stkitts":"KN","saintkitts":"KN","nevis":"KN","saintkittsandnevis":"KN",
    "stlucia":"LC","saintlucia":"LC","saintmartin":"MF","stmartin":"MF","saintmartinfrenchpart":"MF","st pierre":"PM",
    "saintpierre":"PM","miquelon":"PM","saintpierreandmiquelon":"PM","stvincentandthegrenadines":"VC",
    "saintvincentandthegrenadines":"VC","samoa":"WS","sanmarino":"SM","saotomeandprincipe":"ST","saudiarabia":"SA","senegal":"SN",
    "serbia":"RS","seychelles":"SC","sierraleone":"SL","singapore":"SG","stmaarten":"SX","sintmaarten":"SX",
    "sintmaartendutchpart":"SX","slovakia":"SK","slovenia":"SI","solomonislands":"SB","somalia":"SO","safrica":"ZA",
    "southafrica":"ZA","sgeorgia":"GS","southgeorgia":"GS","ssandwichislands":"GS","southsandwichislands":"GS",
    "southgeorgiaandthesouthsandwichislands":"GS","spain":"ES","srilanka":"LK","sudan":"SD","suriname":"SR","ssudan":"SS",
    "southsudan":"SS","svalbard":"SJ","janmayen":"SJ","svalbardandjanmayen":"SJ","swaziland":"SZ","sweden":"SE","switzerland":"CH",
    "syria":"SY","syrian arab republic":"SY","taiwan":"TW","taiwan, province of china":"TW","tajikistan":"TJ","tanzania":"TZ",
    "tanzaniaunitedrepublicof":"TZ","thailand":"TH","timorleste":"TL","togo":"TG","tokelau":"TK","tonga":"TO","trinidad":"TT",
    "tobago":"TT","trinidadandtobago":"TT","tunisia":"TN","turkiye":"TR","turkey":"TR","turkmenistan":"TM",
    "turksandcaicosislands":"TC","tuvalu":"TV","uganda":"UG","ukraine":"UA","uae":"AE","unitedarabemirates":"AE",
    "unitedkingdom":"GB","uk":"GB","gbr":"uk","greatbritain":"GB","england":"GB","scotland":"GB","wales":"GB","northernireland":"GB",
    "nireland":"GB","unitedstates":"US","usa":"US","unitedstatesofamerica":"US","usminoroutlyingislands":"UM",
    "unitedstatesminoroutlyingislands":"UM","uruguay":"UY","uzbekistan":"UZ","vanuatu":"VU","venezuela":"VE",
    "venezuelabolivarianrepublicof":"VE","vietnam":"VN","britishvirginislands":"VG","bvi":"VG",
    "virginislandsbritish":"VG","usvirginislands":"VI","usvi":"VI","virginislands":"VI","wallisandfutuna":"WF",
    "yemen":"YE","zambia":"ZM","zimbabwe":"ZW"}
#---------------------------------------------------------------------------------------------------
# Remove commas, spaces, parentheses, periods, dashes, and apostrophes
CountryTrans = str.maketrans(r"",r"",r",. ()-'") 
#---------------------------------------------------------------------------------------------------
def Normalize_Country( country, default_country='us' ):
    """Normalization process for countries, converts to their ISO-3166-2 code. Defaults to 'us'"""
    stripped_country = country.translate(CountryTrans)
    mapped_country = country_canon.get(stripped_country,country)
    return (mapped_country or default_country).lower()
#---------------------------------------------------------------------------------------------------

In [None]:
Normalize_Country('uzbekistan')

### Normalize States
- Remove periods, spaces
- Convert from dictionary

In [None]:
#---------------------------------------------------------------------------------------------------
dict_States = {"alabama":"al","alaska":"ak","arizona":"az","arkansas":"ar","california":"ca","colorado":"co","connecticut":"ct","delaware":"de",
    "florida":"fl","georgia":"ga","hawaii":"hi","idaho":"id","illinois":"il","indiana":"in","iowa":"ia","kansas":"ks","kentucky":"ky",
    "louisiana":"la","maine":"me","maryland":"md","massachusetts":"ma","michigan":"mi","minnesota":"mn","mississippi":"ms","missouri":"mo",
    "montana":"mt","nebraska":"ne","nevada":"nv","newhampshire":"nh","newjersey":"nj","newmexico":"nm","newyork":"ny","northcarolina":"nc",
    "northdakota":"nd","ohio":"oh","oklahoma":"ok","oregon":"or","pennsylvania":"pa","rhodeisland":"ri","southcarolina":"sc",
    "southdakota":"sd","tennessee":"tn","texas":"tx","utah":"ut","vermont":"vt","virginia":"va","washington":"wa","westvirginia":"wv",
    "wisconsin":"wi","wyoming":"wy","districtofcolumbia":"dc","americansamoa":"as","guam":"gu","northernmarianaislands":"mp",
    "puertorico":"pr","unitedstatesminoroutlyingislands":"um","usvirginislands":"vi",
    "ncarolina":"nc","ndakota":"nd","scarolina":"sc","sdakota":"sd","wvirginia":"wv",
    "ontario":"on","quebec":"qc","novascotia":"ns","newbrunswick":"nb","manitoba":"mb","britishcolumbia":"bc","princeedwardisland":"pe",
    "saskatchewan":"sk","alberta":"ab","newfoundlandandlabrador":"nl",
    "qb":"qc","pq":"qc","newfoundland":"nl","labrador":"nl","ont":"on"
}
dict_State_Values = dict_States.values()
#---------------------------------------------------------------------------------------------------
StateTrans = str.maketrans(r"",r"",r". ") 
#---------------------------------------------------------------------------------------------------
def Normalize_State( state ):
    stripped_state = state.translate(StateTrans)
    mapped_state = stripped_state if stripped_state in dict_State_Values else dict_States.get(stripped_state,state)
    return (mapped_state or state).lower()
#---------------------------------------------------------------------------------------------------

In [None]:
Normalize_State("ont.")

### Normalize postcodes
* strip spaces, dashes

In [None]:
#---------------------------------------------------------------------------------------------------
postcode_translations = str.maketrans(r"",r"",r"- ")
#---------------------------------------------------------------------------------------------------
def Normalize_Postcodes( postcode ):
    stripped_postcode = postcode.translate( postcode_translations )
    return stripped_postcode
#---------------------------------------------------------------------------------------------------

### Normalize address lines
- Strip characters
- parse to convert number words to numbers
- regex to remove ordinals
- tokenize to words
- remove stop words
- abbreviate

In [23]:
### Normalize Address lines
#---------------------------------------------------------------------------------------------------
address_line_translations = str.maketrans(  r"().,-:",\
                                            r"      ",\
                                            r"")
#---------------------------------------------------------------------------------------------------
address_dict={"avenue":"ave","floor":"fl","north":"n","terrace":"terr"} 
address_stopwords=["the","and","of","c/o","attention","attn"]
#---------------------------------------------------------------------------------------------------
def Normalize_Address_Line( address ):
    tokens = re.sub(r"(?<=[0-9])(?:st|nd|rd|th)","",parse( address.translate( address_line_translations ) ),0, re.MULTILINE ).split()
    tokens2 = [address_dict.get(token, token) for token in tokens]
    tokens3 = [token for token in tokens2 if token not in address_stopwords]
    print(f"{0} -> {1} -> {2} -> {3}", address, tokens, tokens2, tokens3)
    return ' '.join(tokens3)
#    return ' '.join([token for token in tokens if token not in address_stopwords])

#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------


In [28]:
#Normalize_Address_Line("757 third avenue, the 3rd floor")
Normalize_Address_Line("543 North Terrace Avenue")
#Normalize_Address_Line("757 3rd ave third floor")


'543 North Terrace Avenue'