In [1]:
import pandas as pd
import numpy as np
import regex as re
import warnings

warnings.filterwarnings('ignore')

In [2]:
class Street:
    def street_direction(x: str) -> str:
        """
        Introduction:
        -------------
        This function can help users to clean the street directions in the street addresses. Specifically, turn different formats of directions to abbreviations.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Street addresses with abbreviated types.

        Example:
        -------------
        >>> e = street_direction('EAST 14TH STREET')
        >>> e
        'E 14TH STREET'
        """ 

        # The orientations in the addresses
        pattern = re.compile(r'\sN\s|\sNORTH\s')
        x = re.sub(pattern, " N ", x)
        pattern = re.compile(r'\sNORTH$')
        x = re.sub(pattern, " N", x)
        pattern = re.compile(r'^NORTH\s')
        x = re.sub(pattern, "N ", x)
        pattern = re.compile(r'\sS\s|\sSOUTH\s')
        x = re.sub(pattern, " S ", x)
        pattern = re.compile(r'\sSOUTH$')
        x = re.sub(pattern, " S", x)
        pattern = re.compile(r'^SOUTH\s')
        x = re.sub(pattern, "S ", x)
        pattern = re.compile(r'\sE\s|\sEAST\s')
        x = re.sub(pattern, " E ", x)
        pattern = re.compile(r'\sEAST$')
        x = re.sub(pattern, " E", x)
        pattern = re.compile(r'^EAST\s')
        x = re.sub(pattern, "E ", x)
        pattern = re.compile(r'\sW\s|\sWEST\s')
        x = re.sub(pattern, " W ", x)
        pattern = re.compile(r'\sWEST$')
        x = re.sub(pattern, " W", x)
        pattern = re.compile(r'^WEST\s')
        x = re.sub(pattern, "W ", x)

        return x

    def street_type(x: str) -> str:
        """
        Introduction:
        -------------
        This function can help users to clean the street types in the street addresses. Specifically, turn different formats of street types to abbreviations.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Street addresses with abbreviated street types.

        Example:
        -------------
        >>> e = street_type('E 14TH STREET')
        >>> e
        'E 14TH ST'
        """

        pattern = re.compile(r'^(\W+\s)')
        x = re.sub(pattern, "", x)
        pattern = re.compile(r'\sSTREET|\sSTREE$|\sSTR|\sSTE$|\sSRT$|\sSR$|\sSST$|\sSEET$|\sTREET$|\sSHEER$|\sSHEE$|\sSTREE$|\sSREET$|\sREET$|\sSTEE$|\sST$|\sS$')
        x = re.sub(pattern, " ST", x)
        pattern = re.compile(r'\sDRIVE$|\sDRV$|\sDRI$|\sDRIV$|\sDRIE$|\sDE$|\sDV$|\sDR$')
        x = re.sub(pattern, " DR", x)
        pattern = re.compile(r'\sCIRCLE$|\sCIRCL$|\sCICLE$|\sCIRC$|\sCIR$|\sCRL$|\sCL$|\sCR$')
        x = re.sub(pattern, " CIR", x)
        pattern = re.compile(r'\sAVENUE$|\sAVENUE|\sAVENU$|\sAVEN$|\sAVE$|\sAVN$|\sAV$')
        x = re.sub(pattern, " AVE", x)
        pattern = re.compile(r'^AVENUE\s|^(\W+)\sAV\s')
        x = re.sub(pattern, "AVE ", x)
        pattern = re.compile(r"\sCOURT$|\sCT$|\sCRT$|\sCTR$|\sCOUR$|<=\sCOT$|\sCORT$")
        x = re.sub(pattern, " CT", x)
        pattern = re.compile(r"\sBOULEVARD$|\sBVLD|\sBL.$|\sB.$")
        x = re.sub(pattern, " BLVD", x)
        pattern = re.compile(r"\sROAD$|\sRD$|\sRAD$|\sROD$")
        x = re.sub(pattern, " RD", x)
        pattern = re.compile(r"\sALLEY$|\sALY$|\sALEY$|\sALL.$|\sAL$|\sAY$")
        x = re.sub(pattern, " ALY", x)
        pattern = re.compile(r"\sPLACE$|\sPL.$|\sP.$|\sPLAC$|\sPLCE$|\sPCE$")
        x = re.sub(pattern, " PL", x)
        pattern = re.compile(r"\sPK$|\sPRK$|\sPRAK$|\sPAK$")
        x = re.sub(pattern, " PARK", x)
        pattern = re.compile(r"\sPARKWAY$|\sPKWY$|\sPARKW$|\sPWY$|\sPKW$|\sPRKWY$|\sPKW$")
        x = re.sub(pattern, " PKWY", x)
        pattern = re.compile(r"\sAPPROA$|\sAPRCH$|\sAPPRCH$|\sAPPR$|\sAPR$")
        x = re.sub(pattern, " APPROACH", x)
        pattern = re.compile(r"\sTERRACE$|\sTERR$|\sTER$|\sTRCE$|\sTRC$|\sTR$")
        x = re.sub(pattern, " TER", x)
        pattern = re.compile(r"\sPLAZA$|\sPLZA$|\sPLZ$|\sPLAZ$|\sPZ$")
        x = re.sub(pattern, " PLZ", x)
        pattern = re.compile(r"\sLANE$|\sLNE$|\sLN$|\sLAN$")
        x = re.sub(pattern, " LN", x)
        pattern = re.compile(r"\sBRIDGE$|\sBRGD$|\sBRG$|\sBGE$")
        x = re.sub(pattern, " BRG",x)
        pattern = re.compile(r"\sHILL$|\sHLL$|\sHL$|\sHIL$")
        x = re.sub(pattern, " HL", x)
        pattern = re.compile(r"\sHEIGHTS$|\sHTS$|\sHT$|\sHEGHTS$|\sHEIGHT$|\sHHT$|\sHEIGT$") 
        x = re.sub(pattern, " HTS", x)
        pattern = re.compile(r"\sSLP$|\sSLEP$|\sSLIIP$|\sSLI$")
        x = re.sub(pattern, " SLIP", x)
        pattern = re.compile(r"\sROOW$|\sRO.$|\sRW$")
        x = re.sub(pattern, " ROW", x)
        pattern = re.compile(r"\sSQUARE$") 
        x = re.sub(pattern, " SQ", x)
        pattern = re.compile(r"\sSQUARE\s") 
        x = re.sub(pattern, " SQ ", x)

        return x

    def split(x: str) -> str:
        """
        Introduction:
        -------------
        This function can help users to seperate the house number and the street addresses and return the street addresses.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Street addresses without house number.

        Example:
        -------------
        >>> e = split('609 EAST 14TH STREET')
        >>> e
        'EAST 14TH STREET'
        """

        pattern_parentheses = re.compile(r'\([^)]*\)')
        x = re.sub(pattern_parentheses, '', x)
        pattern_hn = re.compile(r'^\d+\s')
        x = re.sub(pattern_hn, '', x)
        pattern_hn = re.compile(r'^\d+R\s')
        x = re.sub(pattern_hn, '', x)

        return x

    def street_name(x: str) -> str:
        """
        Introduction:
        -------------
        This function can help users to correct the street names in the street addresses. Some street names are different from past in the geospitial dataset. Therefore, we need to correct the street names to make them conform to the present street names.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Street addresses with correct street names.

        Example:
        -------------
        >>> e = street_name('BATTIE AVE')
        >>> e
        'BALTIC AVE'
        """

        pattern = re.compile(r'(?<=\s)ALLANTIC\s|(?<=\s)ATLASTA\s')
        x = re.sub(pattern, "ATLANTIC", x)
        pattern = re.compile(r'(?<=\s)ALLEM\s')
        x = re.sub(pattern, "ALLEN", x)    

        pattern = re.compile(r'(?<=\s)CROTON\s|(?<=\s)GROTON\s')
        x = re.sub(pattern, "AUDUBON", x)    
        pattern = re.compile(r'(?<=\s)(AT RINS)\s')
        x = re.sub(pattern, "ATKINS", x)  
        pattern = re.compile(r'(?<=\s)AMSTERDAM\s')
        x = re.sub(pattern, "AMSTERDAM", x) 

        pattern = re.compile(r'(?<=\s)BATTIE\s')
        x = re.sub(pattern, "BALTIC", x)      
        pattern = re.compile(r'(?<=\s)BARREE\s')
        x = re.sub(pattern, "BARROW", x)   
        pattern = re.compile(r'(?<=\s)BESSHLEY\s')
        x = re.sub(pattern, "BURLING", x)       
        pattern = re.compile(r'(?<=\s)BIRY\s|(?<=\s)(B WAY)\s|(?<=\s)BWAY\s|(?<=\s)BRAKSWAY\s')
        x = re.sub(pattern, "BROADWAY", x)         
        pattern = re.compile(r'(?<=\s)BUSTWICK\s')
        x = re.sub(pattern, "BUSHWICK", x) 
        pattern = re.compile(r'(?<=\s)BUTTER\s')
        x = re.sub(pattern, "BUTLER", x) 
        pattern = re.compile(r'(?<=\s)BREEVORT\s')
        x = re.sub(pattern, "BREVOORT", x) 
        pattern = re.compile(r'(?<=\s)BRENNEL\s|(?<=\s)BROOMES\s|(?<=\s)BROOM\s|(?<=\s)BRANNAS\s|(?<=\s)BROWN\s')
        x = re.sub(pattern, "BROOME", x) 
        pattern = re.compile(r'(?<=\s)BLACKER\s|(?<=\s)BLENKER\s')
        x = re.sub(pattern, "BLEECKER", x) 

        pattern = re.compile(r'(?<=\s)CLAIR\s')
        x = re.sub(pattern, "CLASSON", x) 
        pattern = re.compile(r'(?<=\s)CLISTEN\s')
        x = re.sub(pattern, "CLINTON", x)     
        pattern = re.compile(r'(?<=\s)CHERY\s')
        x = re.sub(pattern, "CHERRY", x) 
        pattern = re.compile(r'(?<=\s)CHRYSTEE\s|(?<=\s)CHRYSTAL\s|(?<=\s)CHTYSTIE\s|(?<=\s)CHRYSTEL\s')
        x = re.sub(pattern, "CHRYSTIE", x)    
        pattern = re.compile(r'(?<=\s)CENTRAL PARK\s')
        x = re.sub(pattern, "CENTRAL PARK", x) 
        pattern = re.compile(r'(?<=\s)CARRAL\s|(?<=\s)COYAL\s')
        x = re.sub(pattern, "CANAL", x)    
        pattern = re.compile(r'(?<=\s)COLUMBIN\s')
        x = re.sub(pattern, "COLUMBIA", x) 
        pattern = re.compile(r'(?<=\s)CAMNON\s')
        x = re.sub(pattern, "CANNON", x) 
        pattern = re.compile(r'(?<=\s)CROWH\s')
        x = re.sub(pattern, "CROWN", x) 

        pattern = re.compile(r'(?<=\s)DEVAL\s')
        x = re.sub(pattern, "DEVOE", x)
        pattern = re.compile(r'(?<=\s)DEBOUCHEL\s')
        x = re.sub(pattern, "DEBEVOISE",x)
        pattern = re.compile(r'(?<=\s)DAFONT\s')
        x = re.sub(pattern, "DUPONT", x)
        pattern = re.compile(r'(?<=\s)DEGRAN\s')
        x = re.sub(pattern, "DEGRAW", x)
        pattern = re.compile(r'(?<=\s)DENBO\s|(?<=\s)DEKALB\s')
        x = re.sub(pattern, "DE KALB", x)
        pattern = re.compile(r'(?<=\s)DELAMERE\s|(?<=\s)DALANEY\s|(?<=\s)DELANEY\s|(?<=\s)DELANCY\s')
        x = re.sub(pattern, "DELANCEY", x) 

        pattern = re.compile(r'(?<=\s)ELTHZROTH\s|(?<=\s)ELLSWICK\s')
        x = re.sub(pattern, "ELLIOTT", x)
        pattern = re.compile(r'(?<=\s)ELDREDGE\s|(?<=\s)CLARIDGE\s')
        x = re.sub(pattern, "ELDRIDGE", x) 
        pattern = re.compile(r'(?<=\s)ESSEY\s')
        x = re.sub(pattern, "ESSEX", x) 

        pattern = re.compile(r'(?<=\s)FORSYTHE\s')
        x = re.sub(pattern, "FORSYTH", x) 
        pattern = re.compile(r'(?<=\s)FLATHISH\s')
        x = re.sub(pattern, "FLATBUSH", x)

        pattern = re.compile(r'(?<=\s)GLANCE\s')
        x = re.sub(pattern, "GRAND", x) 
        pattern = re.compile(r'(?<=\s)GOAST\s')
        x = re.sub(pattern, "GOERCK", x)
        pattern = re.compile(r'(?<=\s)GREENS\s')
        x = re.sub(pattern, "GREENE", x)
        pattern = re.compile(r'(?<=\s)GREENRICH\s|(?<=\s)GAMWICH\s')
        x = re.sub(pattern, "GREENWICH", x) 

        pattern = re.compile(r'(?<=\s)HOUTON\s')
        x = re.sub(pattern,  "HOUSTON", x) 
        pattern = re.compile(r'(?<=\s)HAVES\s')
        x = re.sub(pattern, "HEWES", x) 
        pattern = re.compile(r'(?<=\s)HAKEY\s')
        x = re.sub(pattern, "HALSEY", x)
        pattern = re.compile(r'(?<=\s)HEWEY\s')
        x = re.sub(pattern, "HENRY", x)
        pattern = re.compile(r'(?<=\s)HICK\s')
        x = re.sub(pattern, "HICKS", x)
        pattern = re.compile(r'(?<=\s)HUMBOLOT\s|(?<=\s)HUMBARD\s|(?<=\s)HUMBOLT\s')
        x = re.sub(pattern, "HUMBOLDT", x) 

        pattern = re.compile(r'(?<=\s)JOHOM\s')
        x = re.sub(pattern, "JOHNS", x) 

        pattern = re.compile(r'(?<=\s)KIOP\s|(?<=\s)HEAP\s')
        x = re.sub(pattern, "KEAP", x) 

        pattern = re.compile(r'(?<=\s)(LAY FAY ESTE)\s|(?<=\s)LADORATT\s|(?<=\s)LAFYAYETTE\s')
        x = re.sub(pattern, "LAFAYETTE", x)
        pattern = re.compile(r'(?<=\s)LIRA\s|(?<=\s)LOUMOR\s|(?<=\s)LARMER\s')
        x = re.sub(pattern, "LORIMER", x)
        pattern = re.compile(r'(?<=\s)LAAVIUK\s')
        x = re.sub(pattern, "LAWRENCE", x) 
        pattern = re.compile(r'(?<=\s)LAIDLOW\s')
        x = re.sub(pattern, "LUDLOW", x) 
        pattern = re.compile(r'(?<=\s)TEX|LEX\s') # perl = True
        x = re.sub(pattern, "LEXINGTON", x)   
        pattern = re.compile(r'(?<=\s)REPPERTS\s')
        x = re.sub(pattern, "LEFFERTS", x)

        pattern = re.compile(r'(?<=\s)PARLE\s|(?<=\s)MALLE\s|(?<=\s)MYETTE\s')
        x = re.sub(pattern, "MYRTLE", x)
        pattern = re.compile(r'(?<=\s)(MC DOUGALL)\s|(?<=\s)(MC DOUGAL)\s|(?<=\s)MCDOUGALL\s')
        x = re.sub(pattern, "MCDOUGAL", x)
        pattern = re.compile(r'(?<=\s)(MC DONOUGH)\s')
        x = re.sub(pattern, "MCDONOUGH", x)
        pattern = re.compile(r'(?<=\s)MANZA\s|(?<=\s)MAREY\s')
        x = re.sub(pattern, "MARCY", x)
        pattern = re.compile(r'(?<=\s)MADISON\s')
        x = re.sub(pattern, "MADISON", x)
        pattern = re.compile(r'(?<=\s)MESCOLE\s')
        x = re.sub(pattern, "MESEROLE", x)
        pattern = re.compile(r'(?<=\s)MEASE\s')
        x = re.sub(pattern, "MOORE", x)
        pattern = re.compile(r'(?<=\s)MEDDLER\s')
        x = re.sub(pattern, "MIDDLETON", x)
        pattern = re.compile(r'(?<=\s)MANGEN\s')
        x = re.sub(pattern, "MANGIN", x)
        pattern = re.compile(r'(?<=\s)HAULL\s|(?<=\s)MALLERY\s')
        x = re.sub(pattern, "MULBERRY", x)

        pattern = re.compile(r'(?<=\s)NAPOLK\s')
        x = re.sub(pattern, "NORFOLK", x)
        pattern = re.compile(r'(?<=\s)(VAST AND)\s')
        x = re.sub(pattern, "NOSTRAND", x)

        pattern = re.compile(r'(?<=\s)DAK\s')
        x = re.sub(pattern, "OAK", x)
        pattern = re.compile(r'(?<=\s)OLWEN\s')
        x = re.sub(pattern, "OLIVER", x)
        pattern = re.compile(r'(?<=\s)GERHARD\s')
        x = re.sub(pattern, "ORCHARD", x)

        pattern = re.compile(r'(?<=\s)PUTT\s')
        x = re.sub(pattern, "PITT", x)
        pattern = re.compile(r'(?<=\s)PERROTT\s|(?<=\s)(PERROTT PREMPONT)\s')
        x = re.sub(pattern, "PIERREPONT", x)
        pattern = re.compile(r'(?<=\s)PLAD\s')
        x = re.sub(pattern, "PLACE", x)
        pattern = re.compile(r'(?<=\s)PRUFER\s')
        x = re.sub(pattern, "PROSPECT", x)
        pattern = re.compile(r'(?<=\s)PREDIDUNT\s')
        x = re.sub(pattern, "PRESIDENT", x)
        pattern = re.compile(r'(?<=\s)PALOKA\s')
        x = re.sub(pattern, "PULASKI", x)

        pattern = re.compile(r'(?<=\s)RUTHIE\s')
        x = re.sub(pattern, "RUTLEDGE", x)
        pattern = re.compile(r'(?<=\s)RIDAL\s')
        x = re.sub(pattern, "RIDGE", x)
        pattern = re.compile(r'(?<=\s)RAYSON\s')
        x = re.sub(pattern, "RYERSON", x)
        pattern = re.compile(r'(?<=\s)REVENTON\s')
        x = re.sub(pattern, "RIVINGTON", x)
        pattern = re.compile(r'(?<=\s)RUALMAINE\s|(?<=\s)(RICER SIDE)\s')
        x = re.sub(pattern, "RIVERSIDE", x)
        pattern = re.compile(r'(?<=\s)REDERICK\s|(?<=\s)RENNICK\s')
        x = re.sub(pattern, "RENWICK", x)

        pattern = re.compile(r'(?<=\s)SELLTOWN\s')
        x = re.sub(pattern, "SULLIVAN", x)
        pattern = re.compile(r'(?<=\s)SISH\s')
        x = re.sub(pattern, "SIDE", x)
        pattern = re.compile(r'(?<=\s)STUCKER\s')
        x = re.sub(pattern, "STEUBEN", x)
        pattern = re.compile(r'(?<=\s)STATES\s')
        x = re.sub(pattern, "STATE", x)
        pattern = re.compile(r'(?<=\s)SCHAALS\s')
        x = re.sub(pattern, "SCHOLES", x)
        pattern = re.compile(r'(?<=\s)SUMME\s')
        x = re.sub(pattern, "SUMMIT", x)
        pattern = re.compile(r'(?<=\s)SCHOMERDOSA\s')
        x = re.sub(pattern, "SCHERMERHORN", x)
        pattern = re.compile(r'(?<=\s)DOUTH\s|(?<=\s)SONSE\s')
        x = re.sub(pattern, "SOUTH", x)
        pattern = re.compile(r'(?<=\s)STUYVESTANT\s')
        x = re.sub(pattern, "STUYVESANT", x)

        pattern = re.compile(r'(?<=\s)STONPSON\s')
        x = re.sub(pattern, "THOMPSON", x)
        pattern = re.compile(r'(?<=\s)TRAY\s')
        x = re.sub(pattern, "TROY", x)
        pattern = re.compile(r'(?<=\s)TAYLER\s')
        x = re.sub(pattern, "TAYLOR", x)

        pattern = re.compile(r'(?<=\s)WMON\s')
        x = re.sub(pattern, "UNION", x)

        pattern = re.compile(r'(?<=\s)(WAR CAREN)\s')
        x = re.sub(pattern, "VAN BUREN", x)
        pattern = re.compile(r'(?<=\s)VEMON\s')
        x = re.sub(pattern, "VERNON", x)
        pattern = re.compile(r'(?<=\s)VANDERLY\s|(?<=\s)VANDERSLIDE\s')
        x = re.sub(pattern, "VANDERBILT", x)

        pattern = re.compile(r'(?<=\s)WYONIA\s')
        x = re.sub(pattern, "WYONA", x)
        pattern = re.compile(r'(?<=\s)WITKINS\s')
        x = re.sub(pattern, "WATKINS", x)
        pattern = re.compile(r'(?<=\s)WALLWORTH\s')
        x = re.sub(pattern, "WALWORTH", x)
        pattern = re.compile(r'(?<=\s)WHIPPER\s')
        x = re.sub(pattern, "WHIPPLE", x)
        pattern = re.compile(r'(?<=\s)WALLABANK|(?<=\s)WALKABOUT\s')
        x = re.sub(pattern, "WALLABOUT", x)
        pattern = re.compile(r'(?<=\s)WASH\s|(?<=\s)WASTEWATER\s')
        x = re.sub(pattern, "WASHINGTON", x) 

        return x

    def street_number_name(x: str) -> str:
        """
        Introduction:
        -------------
        This function can help users to clean the street numbers in the street addresses. Specifically, turn the some desciption numbers to real numbers and extract the street numbers.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Street addresses with clean street numbers.

        Example:
        -------------
        >>> e = street_number_name('E FOURTEENTH STREET')
        >>> e
        'E 14 ST'
        """

        pattern = re.compile(r"(\d+)(\s)(ND|RD|TH)")
        x = re.sub(pattern, r"\1\3", x)
        pattern = re.compile(r"(\d+)(ST|ND|RD|TH|D)")
        x = re.sub(pattern, r"\1", x)
        pattern = re.compile(r"ELEVENTH\s") 
        x = re.sub(pattern, "11", x)
        pattern = re.compile(r"TWELFTH\s") 
        x = re.sub(pattern, "12", x)    
        pattern = re.compile(r"THIRTEENTH\s") 
        x = re.sub(pattern, "13", x)     
        pattern = re.compile(r"FORTEENTH\s|FOURTHENTH\s|\sFOURTEENTH$") 
        x = re.sub(pattern, "14", x)    
        pattern = re.compile(r"FIFTEENTH\s") 
        x = re.sub(pattern, "15", x)
        pattern = re.compile(r"SIXTEENTH\s") 
        x = re.sub(pattern, "16", x)
        pattern = re.compile(r"SEVENTEENTH\s") 
        x = re.sub(pattern, "17", x)
        pattern = re.compile(r"EIGHTEENTH\s|EIGHTEENTH$") 
        x = re.sub(pattern, "18", x)
        pattern = re.compile(r"NINETEENTH\s") 
        x = re.sub(pattern, "19", x)
        pattern = re.compile(r"TWENTIETH\s|TWENTIEFTH\s") 
        x = re.sub(pattern, "20", x)
        pattern = re.compile(r"THIRTIETH\s|THIRTIEFTH\s") 
        x = re.sub(pattern, "30", x)
        pattern = re.compile(r"FORTIETH\s|FOURTIETH\s") 
        x = re.sub(pattern, "40", x)
        pattern = re.compile(r"FIFTIETH\s") 
        x = re.sub(pattern, "50", x)
        pattern = re.compile(r"SIXTIETH\s") 
        x = re.sub(pattern, "60", x)
        pattern = re.compile(r"SEVENTIETH\s") 
        x = re.sub(pattern, "70", x)    
        pattern = re.compile(r"EIGHTIETH\s|EIGHTETH\s") 
        x = re.sub(pattern, "80", x) 
        pattern = re.compile(r"NINETIETH\s|NINTIETH\s") 
        x = re.sub(pattern, "90", x) 
        pattern = re.compile(r"FRIST\s|FRST\s|FIRST\s|ONE HUNDRED\s|ONE HUNRED\s|ONEHUNDRED\s|HUNDRED\s|HUDRED\s|HUNDED\s|ONE\s") 
        x = re.sub(pattern, "1", x) 
        pattern = re.compile(r"TWO HUNDRED\s|TWOHUNDRED\s|TWENTY\s|TWENTI\s|TENTI\s|SECOND\s|SECORD\s|SCOND\s|TWO\s") 
        x = re.sub(pattern, "2", x)
        pattern = re.compile(r"\s(THIRTY)\s|THIRTY\s|THIRTHY\s|THIRTEY\s|TIRTY\s|TRITHY\s|THRID\s|THIRD\s|TIRD\s|TRIH\s|THIR$|THREE\s") 
        x = re.sub(pattern, "3", x)    
        pattern = re.compile(r"FORTY\s|FORTH\s|FORSETH\s|FOURTY\s|FOURTHY\s|FOURTH\s|FOURT\s|FRTY\s|FROTH\s|FROUTH\s|FOUR\s") 
        x = re.sub(pattern, "4", x)
        pattern = re.compile(r"FIFTY\s|FIFTHE\s|FIFTHY\s|FIFTH\s|FIFTEY\s|FIFT\s|FIFT|FITY\s|FIFETH\s|FIFFTH\s|FIVE\s") 
        x = re.sub(pattern, "5", x)
        pattern = re.compile(r"SIXTY\s|SXTY\s|SIXY\s|SIXTHY\s|SIXTEY\s|SIXTH\s|SXTH\s|SITH\s|SIHXT\s|SIX\s") 
        x = re.sub(pattern, "6", x)
        pattern = re.compile(r"SEVENTEY\s|SVENTY\s|SEVENTI\s|SEVENTH\s|SEVENTY-|SEVENTY\s|SVEN\s|SVENTH\s|SEVENH\s|SEVENT\s|SEVEN\s") 
        x = re.sub(pattern, "7", x) 
        pattern = re.compile(r"EIGHTY\s|EIGHTEH\s|EIGHTEY\s|EIGHTE\s|EIGHTH\s|EITH\s|EIGHT\s|EIGHTTH\s|EIGTH\s|FIGHT\s") 
        x = re.sub(pattern, "8", x)       
        pattern = re.compile(r"UNITY\s|NINETY\s|NINETY-|NINETEY\s|NINETIETH\s|NINTH\s|NINTH$|NINTY\s") 
        x = re.sub(pattern, "9", x) 
        pattern = re.compile(r"TENTH\s") 
        x = re.sub(pattern, "10", x)     

        return x

    def special_case(x: str) -> str:

        """
        Introduction:
        -------------
        This function can help users to deal with some special cases in the street addresses. For example, correct the order of the street direction and street number.

        Inputs:
        -------------
        'x': str. The street addresses in your dataset.

        Outputs:
        -------------
        'x': str. Clean street addresses.

        Example:
        -------------
        >>> e = special_character('18 W ST')
        >>> e
        'W 18 ST'
        """

        pattern = re.compile(r'((\w+|\d+)\s\s(\w+|\d+))|((\w+|\d+)\s\s\s(\w+|\d+))')
        x = re.sub(pattern, r"\1 \2", x)

        # deal with the space between numbers - e.g. EAST 11 8 STREET, there is a space between 11 & 8
        pattern = re.compile(r'(\d+)\s(\d+)')
        x = re.sub(pattern, r"\1\2", x)

        # deal with special characters (\;';*'.')
        pattern = re.compile(r"\s(TO)\s") 
        x = re.sub(pattern, "-", x)
        pattern = re.compile(r"-") 
        x = re.sub(pattern, " ", x)
        pattern = re.compile(r"&") 
        x = re.sub(pattern, "AND", x)
        pattern = re.compile(r"[^\w\s]") 
        x = re.sub(pattern, "", x)

        # change the order of the address, e.g: 4 W to W 4 & AVE after A-Z and Digits
        pattern = re.compile(r'(\d+)\s(ST)\s([W|N|S|E])')
        x = re.sub(pattern, r'\3 \1 \2', x)
        pattern = re.compile(r'(CENTRAL PARK)\s([W|N|S|E])\s(AVE)')
        x = re.sub(pattern, r'\2 \1 \3', x)
        pattern = re.compile(r'(WASHINGTON SQ)\s([W|N|S|E])')
        x = re.sub(pattern, r'\2 \1', x)
        pattern = re.compile(r'(MORNINGSIDE AVE)\s([W|N|S|E])')
        x = re.sub(pattern, r'\2 \1', x)
        pattern = re.compile(r'(\w+|\d+)\s(\bN\b|\bW\b|\bS\b|\bE\b)')
        x = re.sub(pattern, r'\2 \1', x)
        pattern = re.compile(r'(AVE)\s(\d+)\s(\d+)')
        x = re.sub(pattern, r'\2-\3 \1', x) 
        pattern = re.compile(r'(AVE)\s(\d+)')
        x = re.sub(pattern, r'\2 \1', x) 
        pattern = re.compile(r'(AVE)\s([A-Z]+)')
        x = re.sub(pattern, r'\2 \1', x)  

        # When there is an 'AND' after hundred, I should convert it
        pattern = re.compile(r'(\d+)AND(\d+)')
        x = re.sub(pattern, r"\1\2", x)

        try:
            pattern = re.compile(r'(\d+)AND (\d+)')
            length = re.findall(pattern, x)
            length_list = list(length[0])

            if len(length_list[1]) == 1:
                x = re.sub(pattern, length_list[0] + '0' + length_list[1],  x)
            else:
                x = re.sub(pattern, length_list[0] + length_list[1],  x)
        except:
            x = x

        # when the address have no space between number and street type
        pattern = re.compile(r"([W|N|S|E])(\d+)")
        x = re.sub(pattern,r"\1 \2", x).strip()
        pattern = re.compile(r"(\d+)(ST|AVE|DR|CIR|CT|BLVD|ALY|PLZ|PARKS|PKWY|APPROACH|TER|PL|LN|BRG|HL|HTS|SLIP|ROW|SQ)")
        x = re.sub(pattern,r"\1 \2", x).strip()

        return x
    
    def clean(raw_street: str) -> str:  
        
        raw_street = raw_street.upper()
        street_direction_clean = Street.street_direction(raw_street)
        street_type_clean = Street.street_type(street_direction_clean)

        street_number_name_clean = Street.street_number_name(street_type_clean)
        street_name_clean = Street.street_name(street_number_name_clean)
        final_clean_address = Street.special_case(street_name_clean)

        pattern_street_direction = r'(?<=\s)[N|W|S|E]\s|^[N|W|S|E]\s|(?<=\s)[N|W|S|E]\d+'
        street_direction = ' '.join(re.findall(pattern_street_direction, final_clean_address))

        pattern_street_type = r'\sST$|\sDR$|\sCIR$|\sAVE$|\sCT$|\sBLVD$|\sALY$|\sPLZ$|\sPARK$|\sPKWY$|\sAPPROACH$|\sTER$|\sPL$|\sLN$|\sBRG$|(?<=\s)HL$|\sHTS$|\sSLIP$|\sROW$|\sSQ$'
        street_type = ' '.join(re.findall(pattern_street_type, final_clean_address))

        street_name = final_clean_address.replace(street_direction, '')
        street_name = street_name.replace(street_type, '')

        return raw_street, street_direction, street_name, street_type, final_clean_address
    
    def clean_hn(raw_street: str) -> str:  
        
        raw_street = raw_street.upper()
        street_direction_clean = Street.street_direction(raw_street)
        street_type_clean = Street.street_type(street_direction_clean)
        street_without_hn = Street.split(street_type_clean)

        street_number_name_clean = Street.street_number_name(street_without_hn)
        street_name_clean = Street.street_name(street_number_name_clean)
        final_clean_address = Street.special_case(street_name_clean)
        
        pattern_hn = r'^\d+\s|^\d+R\s'
        house_number = ' '.join(re.findall(pattern_hn, raw_street))

        pattern_number = re.compile(r'\d+')
        house_number = ' '.join(re.findall(pattern_number, house_number))

        pattern_street_direction = r'(?<=\s)[N|W|S|E]\s|^[N|W|S|E]\s|(?<=\s)[N|W|S|E]\d+'
        street_direction = ' '.join(re.findall(pattern_street_direction, final_clean_address))

        pattern_street_type = r'\sST$|\sDR$|\sCIR$|\sAVE$|\sCT$|\sBLVD$|\sALY$|\sPLZ$|\sPARK$|\sPKWY$|\sAPPROACH$|\sTER$|\sPL$|\sLN$|\sBRG$|(?<=\s)HL$|\sHTS$|\sSLIP$|\sROW$|\sSQ$'
        street_type = ' '.join(re.findall(pattern_street_type, final_clean_address))

        street_name = final_clean_address.replace(street_direction, '')
        street_name = street_name.replace(street_type, '')

        return raw_street, house_number, street_direction, street_name, street_type, final_clean_address






In [3]:
# Load the 1910 census sample from a CSV file in the working directory and
# list its column names.
census_1910_h_mn_10k = pd.read_csv('census_1910_h_mn_10k.csv')
census_1910_h_mn_10k.columns

Index(['Record type', 'Standardized township (string)', 'County 2',
       'Enumeration district 2',
       'Consistent historical data person identifier',
       'Dwelling sequence number', 'Dwelling serial number',
       'Dwelling serial number 2', 'Household sequence within dwelling',
       'Household sequence within dwelling, 8 digit',
       'Household serial number 2',
       'Household serial number, before large group quarters were split up (100% datasets)',
       'Individual sequence number',
       'Large group quarters that was split up (100% datasets)', 'Line number',
       'Line number 2', 'Microfilm page number',
       'Number of families in household',
       'Number of person records in household, before large group quarters were split up  (100% datasets)',
       'House number', 'Street address 2'],
      dtype='object')

In [4]:
# Replace every raw address in 'Street address 2' with the tuple returned by Street.clean.
census_1910_h_mn_10k['Street address 2'] = census_1910_h_mn_10k['Street address 2'].map(Street.clean)


In [5]:
# Display the DataFrame to inspect the 'Street address 2' column.
census_1910_h_mn_10k

Unnamed: 0,Record type,Standardized township (string),County 2,Enumeration district 2,Consistent historical data person identifier,Dwelling sequence number,Dwelling serial number,Dwelling serial number 2,Household sequence within dwelling,"Household sequence within dwelling, 8 digit",...,"Household serial number, before large group quarters were split up (100% datasets)",Individual sequence number,Large group quarters that was split up (100% datasets),Line number,Line number 2,Microfilm page number,Number of families in household,"Number of person records in household, before large group quarters were split up (100% datasets)",House number,Street address 2
0,H,MANHATTAN,610,1256,,,60496,661,38,60496,...,60496,,0,51,,214,2,7,,"(38 STREET, , 38, ST, 38 ST)"
1,H,MANHATTAN,610,339,,,1943060,21190,18,1943060,...,1943060,,0,96,,379,1,2,,"(EAST 107TH STREET, E , 107, ST, E 107 ST)"
2,H,MANHATTAN,610,1043,,,477699,5196,50,477699,...,477699,,0,90,,93,1,5,337,"(81ST STREET, , 81, ST, 81 ST)"
3,H,MANHATTAN,610,292,,,217548,2408,92,217548,...,217548,,0,52,,699,2,4,,"(EAST 117TH STREET, E , 117, ST, E 117 ST)"
4,H,MANHATTAN,610,1318,,,739265,8103,25,739265,...,739265,,0,25,,466,1,1,,"(WEST 62ND STREET, W , 62, ST, W 62 ST)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,H,MANHATTAN,610,1393,,,1985123,21672,26,1985123,...,1985123,,1,8,,1080,1,287,2346,"(BROADWAY BET 85886 ST BRETTON HA, , BROADWAY ..."
9996,H,MANHATTAN,610,1339,,,70035,771,62,70035,...,70035,,0,62,,47,1,2,318,"(49TH STREET, , 49, ST, 49 ST)"
9997,H,MANHATTAN,610,860,,,1335929,14544,64,1335929,...,1335929,,0,40,,350,1,3,,"(WEST EIGHTEENTH STREET, W , 18, ST, W 18 ST)"
9998,H,MANHATTAN,610,923,,,1074169,11698,82,1074169,...,1074169,,0,59,,833,3,4,414-12-10,"(EAST 13TH ST, E , 13, ST, E 13 ST)"


In [6]:
# Spread the elements of each 'Street address 2' tuple over separate columns,
# one per position (0-4).
census_1910_h_mn_10k['raw_street'] = census_1910_h_mn_10k['Street address 2'].str.get(0)
census_1910_h_mn_10k['street_direction'] = census_1910_h_mn_10k['Street address 2'].str.get(1)
census_1910_h_mn_10k['street_name'] = census_1910_h_mn_10k['Street address 2'].str.get(2)
census_1910_h_mn_10k['street_type'] = census_1910_h_mn_10k['Street address 2'].str.get(3)
census_1910_h_mn_10k['final_clean_address'] = census_1910_h_mn_10k['Street address 2'].str.get(4)





In [7]:
# Display the DataFrame again, now including the columns added in the previous cell.
census_1910_h_mn_10k

Unnamed: 0,Record type,Standardized township (string),County 2,Enumeration district 2,Consistent historical data person identifier,Dwelling sequence number,Dwelling serial number,Dwelling serial number 2,Household sequence within dwelling,"Household sequence within dwelling, 8 digit",...,Microfilm page number,Number of families in household,"Number of person records in household, before large group quarters were split up (100% datasets)",House number,Street address 2,raw_street,street_direction,street_name,street_type,final_clean_address
0,H,MANHATTAN,610,1256,,,60496,661,38,60496,...,214,2,7,,"(38 STREET, , 38, ST, 38 ST)",38 STREET,,38,ST,38 ST
1,H,MANHATTAN,610,339,,,1943060,21190,18,1943060,...,379,1,2,,"(EAST 107TH STREET, E , 107, ST, E 107 ST)",EAST 107TH STREET,E,107,ST,E 107 ST
2,H,MANHATTAN,610,1043,,,477699,5196,50,477699,...,93,1,5,337,"(81ST STREET, , 81, ST, 81 ST)",81ST STREET,,81,ST,81 ST
3,H,MANHATTAN,610,292,,,217548,2408,92,217548,...,699,2,4,,"(EAST 117TH STREET, E , 117, ST, E 117 ST)",EAST 117TH STREET,E,117,ST,E 117 ST
4,H,MANHATTAN,610,1318,,,739265,8103,25,739265,...,466,1,1,,"(WEST 62ND STREET, W , 62, ST, W 62 ST)",WEST 62ND STREET,W,62,ST,W 62 ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,H,MANHATTAN,610,1393,,,1985123,21672,26,1985123,...,1080,1,287,2346,"(BROADWAY BET 85886 ST BRETTON HA, , BROADWAY ...",BROADWAY BET 85886 ST BRETTON HA,,BROADWAY BET 85886 ST BRETTON HA,,BROADWAY BET 85886 ST BRETTON HA
9996,H,MANHATTAN,610,1339,,,70035,771,62,70035,...,47,1,2,318,"(49TH STREET, , 49, ST, 49 ST)",49TH STREET,,49,ST,49 ST
9997,H,MANHATTAN,610,860,,,1335929,14544,64,1335929,...,350,1,3,,"(WEST EIGHTEENTH STREET, W , 18, ST, W 18 ST)",WEST EIGHTEENTH STREET,W,18,ST,W 18 ST
9998,H,MANHATTAN,610,923,,,1074169,11698,82,1074169,...,833,3,4,414-12-10,"(EAST 13TH ST, E , 13, ST, E 13 ST)",EAST 13TH ST,E,13,ST,E 13 ST


In [8]:
# Load the 1880 census sample from a CSV file in the working directory and
# list its column names.
census_1880_h_mn_10k = pd.read_csv('census_1880_h_mn_10k.csv')
census_1880_h_mn_10k.columns

Index(['rectype', 'year', 'serial', 'dwsize', 'city', 'pageno', 'microseq',
       'nfams', 'splithid', 'splitnum', 'mcd', 'county', 'enumdist', 'supdist',
       'street'],
      dtype='object')

In [9]:
# Replace every raw address in 'street' with the tuple returned by Street.clean_hn.
census_1880_h_mn_10k['street'] = census_1880_h_mn_10k['street'].map(Street.clean_hn)

# Spread the elements of each tuple over separate columns, one per position (0-5).
census_1880_h_mn_10k['raw_street'] = census_1880_h_mn_10k['street'].str.get(0)
census_1880_h_mn_10k['house_number'] = census_1880_h_mn_10k['street'].str.get(1)
census_1880_h_mn_10k['street_direction'] = census_1880_h_mn_10k['street'].str.get(2)
census_1880_h_mn_10k['street_name'] = census_1880_h_mn_10k['street'].str.get(3)
census_1880_h_mn_10k['street_type'] = census_1880_h_mn_10k['street'].str.get(4)
census_1880_h_mn_10k['final_clean_address'] = census_1880_h_mn_10k['street'].str.get(5)



In [10]:
# Display the 1880 DataFrame.
census_1880_h_mn_10k

Unnamed: 0,rectype,year,serial,dwsize,city,pageno,microseq,nfams,splithid,splitnum,...,county,enumdist,supdist,street,raw_street,house_number,street_direction,street_name,street_type,final_clean_address
0,H,1880,6320662,9999,"New York, NY",613,1,1 family or N/A,6115976,4,...,610,305,1,"(310 E HOUSTON ST, 310, E , HOUSTON, ST, E HO...",310 E HOUSTON ST,310,E,HOUSTON,ST,E HOUSTON ST
1,H,1880,6452608,52,"New York, NY",153,1,1 family or N/A,6230210,2,...,610,619,1,"(1168 2ND AV, 1168, , 2, AVE, 2 AVE)",1168 2ND AV,1168,,2,AVE,2 AVE
2,H,1880,6453777,9999,"New York, NY",212,1,1 family or N/A,6231379,5,...,610,622,1,"(313 60TH ST, 313, , 60, ST, 60 ST)",313 60TH ST,313,,60,ST,60 ST
3,H,1880,6293352,9999,"New York, NY",50,2,2 families,6090867,5,...,610,241,1,"(632 HUDSON ST, 632, , HUDSON, ST, HUDSON ST)",632 HUDSON ST,632,,HUDSON,ST,HUDSON ST
4,H,1880,6215033,19,"New York, NY",92,2,1 family or N/A,6017853,4,...,610,40,1,"(56 CENTRE ST, 56, , CENTRE, ST, CENTRE ST)",56 CENTRE ST,56,,CENTRE,ST,CENTRE ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,H,1880,6325737,9999,"New York, NY",184,1,1 family or N/A,6120796,4,...,610,317,1,"(400R 8TH ST, 400, , 8, ST, 8 ST)",400R 8TH ST,400,,8,ST,8 ST
9996,H,1880,6204444,9999,"New York, NY",205,1,1 family or N/A,6009270,4,...,610,11,1,"(88 ANN ST, 88, , ANN, ST, ANN ST)",88 ANN ST,88,,ANN,ST,ANN ST
9997,H,1880,6304543,9999,"New York, NY",551,1,1 family or N/A,6101805,6,...,610,266,1,"(171 4TH ST, 171, , 4, ST, 4 ST)",171 4TH ST,171,,4,ST,4 ST
9998,H,1880,6327464,44,"New York, NY",271,1,1 family or N/A,6122523,6,...,610,322,1,"(415 E 10TH ST, 415, E , 10, ST, E 10 ST)",415 E 10TH ST,415,E,10,ST,E 10 ST
