In [1]:
#import the packages that needed
import pandas as pd
import numpy as np
import regex as re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('census_1910_mn_small.csv')

In [3]:
df.columns

Index(['Record type', 'Standardized township (string)', 'County 2',
       'Enumeration district 2',
       'Consistent historical data person identifier',
       'Dwelling sequence number', 'Dwelling serial number',
       'Dwelling serial number 2', 'Household sequence within dwelling',
       'Household sequence within dwelling, 8 digit',
       'Household serial number 2',
       'Household serial number, before large group quarters were split up (100% datasets)',
       'Individual sequence number',
       'Large group quarters that was split up (100% datasets)', 'Line number',
       'Line number 2', 'Microfilm page number',
       'Number of families in household',
       'Number of person records in household, before large group quarters were split up  (100% datasets)',
       'House number', 'Street address 2'],
      dtype='object')

In [4]:

def street_type(x):
    pattern = re.compile(r'\sSTREET|\sSTR|\sSTE$|\sSRT$|\sSR$\s|\sSST$|\sSEET$|\sTREET$|\sSHEER$|\sSHEE$|\sSTREE$|\sSREET$|\sREET$|\sSTEE$|\sST$')
    x = re.sub(pattern, " ST", x)
    pattern = re.compile(r'\sDRIVE$|\sDRV$|\sDRI$|\sDRIV$|\sDRIE$|\sD.$')
    x = re.sub(pattern, " DR", x)
    pattern = re.compile(r'\sCIRCLE$|\sCIRCL$|\sCICLE$|\sCIRC$|\sCIR$|\sCRL$|\sC.$')
    x = re.sub(pattern, " CIR", x)
    pattern = re.compile(r'\sAVENUE$|\sAVENU$|\sAVEN$|\sAVE$|\sAVN$\sAV$')
    x = re.sub(pattern, " AVE", x)
    pattern = re.compile(r"\sCOURT$|\sCT$|\sCRT$|\sCTR$|\sCOUR$|<=\sCOT$|\sCORT$")
    x = re.sub(pattern, " CT", x)
    pattern = re.compile(r"\sBOULEVARD$|\sBVLD|\sBL.$|\sB.$")
    x = re.sub(pattern, " BLVD", x)
    pattern = re.compile(r"\sROAD$|\sRD$|\sRAD$|\sROD$")
    x = re.sub(pattern, " RD", x)
    pattern = re.compile(r"\sALLEY$|\sALY$|\sALEY$|\sALL.$|\sA.$")
    x = re.sub(pattern, " ALY", x)
    pattern = re.compile(r"\sPLACE$|\sPL.$|\sP.$|\sPLAC$|\sPLCE$|\sPCE$")
    x = re.sub(pattern, " PL", x)
    pattern = re.compile(r"\sPK$|\sPRK$|\sPRAK$|\sPAK$")
    x = re.sub(pattern, " PARK", x)
    pattern = re.compile(r"\sPARKWAY$|\sPKWY$|\sPARKW$|\sPWY$|\sPKW$|\sPRKWY$|\sPKW$")
    x = re.sub(pattern, " PKWY", x)
    pattern = re.compile(r"\sAPPROA$|\sAPRCH$|\sAPPRCH$|\sAPPR$|\sAPR$")
    x = re.sub(pattern, " APPROACH", x)
    pattern = re.compile(r"\sTERRACE$|\sTERR$|\sTER$|\sTRCE$|\sTRC$|\sTR$")
    x = re.sub(pattern, " TER", x)
    pattern = re.compile(r"\sPLAZA$|\sPLZA$|\sPLZ$|\sPLAZ$|\sPZ$")
    x = re.sub(pattern, " PLZ", x)
    pattern = re.compile(r"\sLANE$|\sLNE$|\sLN$|\sLAN$")
    x = re.sub(pattern, " LN", x)
    pattern = re.compile(r"\sBRIDGE$|\sBRGD$|\sBRG$|\sBGE$")
    x = re.sub(pattern, " BRG",x)
    pattern = re.compile(r"\sHILL$|\sHLL$|\sHL$|\sHIL$")
    x = re.sub(pattern, " HL", x)
    pattern = re.compile(r"\sHEIGHTS$|\sHTS$|\sHT$|\sHEGHTS$|\sHEIGHT$|\sHHT$|\sHEIGT$") 
    x = re.sub(pattern, " HTS", x)
    pattern = re.compile(r"\sSLP$|\sSLEP$|\sSLIIP$|\sSLI$")
    x = re.sub(pattern, " SLIP", x)
    pattern = re.compile(r"\sROOW$|\sRO.$|\sRW$")
    x = re.sub(pattern, " ROW", x)
    pattern = re.compile(r"\sSQUARE$") 
    x = re.sub(pattern, " SQ", x)

    return x

df['Street address 2'] = df['Street address 2'].apply(lambda x : street_type(x))


In [5]:
df['Street address 2']

0               38 ST
1       EAST 107TH ST
2             81ST ST
3       EAST 117TH ST
4        WEST 62ND ST
            ...      
1995      WEST 158 ST
1996        E 78TH ST
1997          8TH AVE
1998    LEXINGTON AVE
1999    609 E 14TH ST
Name: Street address 2, Length: 2000, dtype: object

In [6]:
pattern_street_type_clean = r'(?<=\s)ST$|(?<=\s)DR$|(?<=\s)CIR$|(?<=\s)AVE$|(?<=\s)CT$|(?<=\s)BLVD$|(?<=\s)ALY$|(?<=\s)PLZ$|(?<=\s)PARK$|(?<=\s)PKWY$|(?<=\s)APPROACH$|(?<=\s)TER$|(?<=\s)PL$|(?<=\s)LN$|(?<=\s)BRG$|(?<=\s)HL$|(?<=\s)HTS$|(?<=\s)SLIP$|(?<=\s)ROW$|(?<=\s)SQ$'

def search_street_type(x):
    result = re.findall(pattern_street_type_clean, x)
    return "".join(result)

df['street_type_clean'] = df['Street address 2'].apply(lambda x : search_street_type(x))

In [7]:
df['street_type_clean']

0        ST
1        ST
2        ST
3        ST
4        ST
       ... 
1995     ST
1996     ST
1997    AVE
1998    AVE
1999     ST
Name: street_type_clean, Length: 2000, dtype: object

In [8]:
def street_name_clean(x):
    pattern = re.compile(r'\sALLANTIC\s|\sATLASTA\s')
    x = re.sub(pattern, "ATLANTIC", x)
    pattern = re.compile(r'\sALLEM\s')
    x = re.sub(pattern, "ALLEN", x)    

    pattern = re.compile(r'\sCROTON\s|\sGROTON\s')
    x = re.sub(pattern, "AUDUBON", x)    
    pattern = re.compile(r'\s(AT RINS)\s')
    x = re.sub(pattern, "ATKINS", x)     

    pattern = re.compile(r'\sBATTIE\s')
    x = re.sub(pattern, "BALTIC", x)      
    pattern = re.compile(r'\sBARREE\s')
    x = re.sub(pattern, "BARROW", x)   
    pattern = re.compile(r'\sBESSHLEY\s')
    x = re.sub(pattern, "BURLING", x)       
    pattern = re.compile(r'\sBIRY\s|\s(B WAY)\s|\sBWAY\s|\sBRAKSWAY\s')
    x = re.sub(pattern, "BROADWAY", x)         
    pattern = re.compile(r'\sBUSTWICK\s')
    x = re.sub(pattern, "BUSHWICK", x) 
    pattern = re.compile(r'\sBUTTER\s')
    x = re.sub(pattern, "BUTLER", x) 
    pattern = re.compile(r'\sBREEVORT\s')
    x = re.sub(pattern, "BREVOORT", x) 
    pattern = re.compile(r'\sBRENNEL\s|\sBROOMES\s|\sBROOM\s|\sBRANNAS\s|\sBROWN\s')
    x = re.sub(pattern, "BROOME", x) 
    pattern = re.compile(r'\sBLACKER\s|\sBLENKER\s')
    x = re.sub(pattern, "BLEECKER", x) 

    pattern = re.compile(r'\sCLAIR\s')
    x = re.sub(pattern, "CLASSON", x) 
    pattern = re.compile(r'\sCLISTEN\s')
    x = re.sub(pattern, "CLINTON", x)     
    pattern = re.compile(r'\sCHERY\s')
    x = re.sub(pattern, "CHERRY", x) 
    pattern = re.compile(r'\sCHRYSTEE\s|\sCHRYSTAL\s|\sCHTYSTIE\s|\sCHRYSTEL\s')
    x = re.sub(pattern, "CHRYSTIE", x)     
    pattern = re.compile(r'\sCARRAL\s|\sCOYAL\s')
    x = re.sub(pattern, "CANAL", x)    
    pattern = re.compile(r'\sCOLUMBIN\s')
    x = re.sub(pattern, "COLUMBIA", x) 
    pattern = re.compile(r'\sCAMNON\s')
    x = re.sub(pattern, "CANNON", x) 
    pattern = re.compile(r'\sCROWH\s')
    x = re.sub(pattern, "CROWN", x) 

    pattern = re.compile(r'\sDEVAL\s')
    x = re.sub(pattern, "DEVOE", x)
    pattern = re.compile(r'\sDEBOUCHEL\s')
    x = re.sub(pattern, "DEBEVOISE",x)
    pattern = re.compile(r'\sDAFONT\s')
    x = re.sub(pattern, "DUPONT", x)
    pattern = re.compile(r'\sDEGRAN\s')
    x = re.sub(pattern, "DEGRAW", x)
    pattern = re.compile(r'\sDENBO\s|\sDEKALB\s')
    x = re.sub(pattern, "DE KALB", x)
    pattern = re.compile(r'\sDELAMERE\s|\sDALANEY\s|\sDELANEY\s|\sDELANCY\s')
    x = re.sub(pattern, "DELANCEY", x) 

    pattern = re.compile(r'\sELTHZROTH\s|\sELLSWICK\s')
    x = re.sub(pattern, "ELLIOTT", x)
    pattern = re.compile(r'\sELDREDGE\s|\sCLARIDGE\s')
    x = re.sub(pattern, "ELDRIDGE", x) 
    pattern = re.compile(r'\sESSEY\s')
    x = re.sub(pattern, "ESSEX", x) 

    pattern = re.compile(r'\sFORSYTHE\s')
    x = re.sub(pattern, "FORSYTH", x) 
    pattern = re.compile(r'\sFLATHISH\s')
    x = re.sub(pattern, "FLATBUSH", x)

    pattern = re.compile(r'\sGLANCE\s')
    x = re.sub(pattern, "GRAND", x) 
    pattern = re.compile(r'\sGOAST\s')
    x = re.sub(pattern, "GOERCK", x)
    pattern = re.compile(r'\sGREENS\s')
    x = re.sub(pattern, "GREENE", x)
    pattern = re.compile(r'\sGREENRICH\s|\sGAMWICH\s')
    x = re.sub(pattern, "GREENWICH", x) 

    pattern = re.compile(r'\sHOUTON\s')
    x = re.sub(pattern,  "HOUSTON", x) 
    pattern = re.compile(r'\sHAVES\s')
    x = re.sub(pattern, "HEWES", x) 
    pattern = re.compile(r'\sHAKEY\s')
    x = re.sub(pattern, "HALSEY", x)
    pattern = re.compile(r'\sHEWEY\s')
    x = re.sub(pattern, "HENRY", x)
    pattern = re.compile(r'\sHICK\s')
    x = re.sub(pattern, "HICKS", x)
    pattern = re.compile(r'\sHUMBOLOT\s|\sHUMBARD\s|\sHUMBOLT\s')
    x = re.sub(pattern, "HUMBOLDT", x) 

    pattern = re.compile(r'\sJOHOM\s')
    x = re.sub(pattern, "JOHNS", x) 

    pattern = re.compile(r'\sKIOP\s|\sHEAP\s')
    x = re.sub(pattern, "KEAP", x) 

    pattern = re.compile(r'\s(LAY FAY ESTE)\s|\sLADORATT\s|\sLAFYAYETTE\s')
    x = re.sub(pattern, "LAFAYETTE", x)
    pattern = re.compile(r'\sLIRA\s|\sLOUMOR\s|\sLARMER\s')
    x = re.sub(pattern, "LORIMER", x)
    pattern = re.compile(r'\sLAAVIUK\s')
    x = re.sub(pattern, "LAWRENCE", x) 
    pattern = re.compile(r'\sLAIDLOW\s')
    x = re.sub(pattern, "LUDLOW", x) 
    pattern = re.compile(r'\sTEX|LEX\s') # perl = True
    x = re.sub(pattern, "LEXINGTON", x)   
    pattern = re.compile(r'\sREPPERTS\s')
    x = re.sub(pattern, "LEFFERTS", x)

    pattern = re.compile(r'\sPARLE\s|\sMALLE\s|\sMYETTE\s')
    x = re.sub(pattern, "MYRTLE", x)
    pattern = re.compile(r'\s(MC DOUGALL)\s|\s(MC DOUGAL)\s|\sMCDOUGALL\s')
    x = re.sub(pattern, "MCDOUGAL", x)
    pattern = re.compile(r'\s(MC DONOUGH)\s')
    x = re.sub(pattern, "MCDONOUGH", x)
    pattern = re.compile(r'\sMANZA\s|\sMAREY\s')
    x = re.sub(pattern, "MARCY", x)
    pattern = re.compile(r'\sMESCOLE\s')
    x = re.sub(pattern, "MESEROLE", x)
    pattern = re.compile(r'\sMEASE\s')
    x = re.sub(pattern, "MOORE", x)
    pattern = re.compile(r'\sMEDDLER\s')
    x = re.sub(pattern, "MIDDLETON", x)
    pattern = re.compile(r'\sMANGEN\s')
    x = re.sub(pattern, "MANGIN", x)
    pattern = re.compile(r'\sHAULL\s|\sMALLERY\s')
    x = re.sub(pattern, "MULBERRY", x)

    pattern = re.compile(r'\sNAPOLK\s')
    x = re.sub(pattern, "NORFOLK", x)
    pattern = re.compile(r'\s(VAST AND)\s')
    x = re.sub(pattern, "NOSTRAND", x)

    pattern = re.compile(r'\sDAK\s')
    x = re.sub(pattern, "OAK", x)
    pattern = re.compile(r'\sOLWEN\s')
    x = re.sub(pattern, "OLIVER", x)
    pattern = re.compile(r'\sGERHARD\s')
    x = re.sub(pattern, "ORCHARD", x)

    pattern = re.compile(r'\sPUTT\s')
    x = re.sub(pattern, "PITT", x)
    pattern = re.compile(r'\sPERROTT\s|\s(PERROTT PREMPONT)\s')
    x = re.sub(pattern, "PIERREPONT", x)
    pattern = re.compile(r'\sPLAD\s')
    x = re.sub(pattern, "PLACE", x)
    pattern = re.compile(r'\sPRUFER\s')
    x = re.sub(pattern, "PROSPECT", x)
    pattern = re.compile(r'\sPREDIDUNT\s')
    x = re.sub(pattern, "PRESIDENT", x)
    pattern = re.compile(r'\sPALOKA\s')
    x = re.sub(pattern, "PULASKI", x)

    pattern = re.compile(r'\sRUTHIE\s')
    x = re.sub(pattern, "RUTLEDGE", x)
    pattern = re.compile(r'\sRIDAL\s')
    x = re.sub(pattern, "RIDGE", x)
    pattern = re.compile(r'\sRAYSON\s')
    x = re.sub(pattern, "RYERSON", x)
    pattern = re.compile(r'\sREVENTON\s')
    x = re.sub(pattern, "RIVINGTON", x)
    pattern = re.compile(r'\sRUALMAINE\s|\s(RICER SIDE)\s')
    x = re.sub(pattern, "RIVERSIDE", x)
    pattern = re.compile(r'\sREDERICK\s|\sRENNICK\s')
    x = re.sub(pattern, "RENWICK", x)

    pattern = re.compile(r'\sSELLTOWN\s')
    x = re.sub(pattern, "SULLIVAN", x)
    pattern = re.compile(r'\sSISH\s')
    x = re.sub(pattern, "SIDE", x)
    pattern = re.compile(r'\sSTUCKER\s')
    x = re.sub(pattern, "STEUBEN", x)
    pattern = re.compile(r'\sSTATES\s')
    x = re.sub(pattern, "STATE", x)
    pattern = re.compile(r'\sSCHAALS\s')
    x = re.sub(pattern, "SCHOLES", x)
    pattern = re.compile(r'\sSUMME\s')
    x = re.sub(pattern, "SUMMIT", x)
    pattern = re.compile(r'\sSCHOMERDOSA\s')
    x = re.sub(pattern, "SCHERMERHORN", x)
    pattern = re.compile(r'\sDOUTH\s|\sSONSE\s')
    x = re.sub(pattern, "SOUTH", x)
    pattern = re.compile(r'\sSTUYVESTANT\s')
    x = re.sub(pattern, "STUYVESANT", x)

    pattern = re.compile(r'\sSTONPSON\s')
    x = re.sub(pattern, "THOMPSON", x)
    pattern = re.compile(r'\sTRAY\s')
    x = re.sub(pattern, "TROY", x)
    pattern = re.compile(r'\sTAYLER\s')
    x = re.sub(pattern, "TAYLOR", x)

    pattern = re.compile(r'\sWMON\s')
    x = re.sub(pattern, "UNION", x)

    pattern = re.compile(r'\s(WAR CAREN)\s')
    x = re.sub(pattern, "VAN BUREN", x)
    pattern = re.compile(r'\sVEMON\s')
    x = re.sub(pattern, "VERNON", x)
    pattern = re.compile(r'\sVANDERLY\s|\sVANDERSLIDE\s')
    x = re.sub(pattern, "VANDERBILT", x)

    pattern = re.compile(r'\sWYONIA\s')
    x = re.sub(pattern, "WYONA", x)
    pattern = re.compile(r'\sWITKINS\s')
    x = re.sub(pattern, "WATKINS", x)
    pattern = re.compile(r'\sWALLWORTH\s')
    x = re.sub(pattern, "WALWORTH", x)
    pattern = re.compile(r'\sWHIPPER\s')
    x = re.sub(pattern, "WHIPPLE", x)
    pattern = re.compile(r'\sWALLABANK|\sWALKABOUT\s')
    x = re.sub(pattern, "WALLABOUT", x)
    pattern = re.compile(r'\sWASH\s|\sWASTEWATER\s')
    x = re.sub(pattern, "WASHINGTON", x) 

    return x

df['Street address 2'] = df['Street address 2'].apply(lambda x : street_name_clean(x))



In [13]:
pd.set_option('display.max_rows', None)
df['Street address 2']

0                                  38 ST
1                          EAST 107TH ST
2                                81ST ST
3                          EAST 117TH ST
4                           WEST 62ND ST
5                           WEST 90TH ST
6                                    A A
7                            EAST 8TH ST
8                                13TH ST
9               AMSTERDAM AVE 190-195 ST
10                          WEST 78TH ST
11                             CHERRY ST
12                 CENTRAL PARK WEST AVE
13                          EAST 50TH ST
14                            WEST 26 ST
15                          WEST 61ST ST
16                            CORNICE ST
17                         AMSTERDAM AVE
18                            EAST 32 ST
19                              WEST 145
20                         WEST 117TH ST
21                         EAST FIFTH ST
22                                 84 ST
23                          WEST 43RD ST
24              

In [10]:
df.street_name_clean.unique()

array([''], dtype=object)

In [14]:
print(df['street_type_clean'])
print(df['street_type_clean'].unique())

0        ST
1        ST
2        ST
3        ST
4        ST
       ... 
1995     ST
1996     ST
1997    AVE
1998    AVE
1999     ST
Name: street_type_clean, Length: 2000, dtype: object
['ST' '' 'AVE' 'PL' 'CIR' 'ALY' 'PARK' 'SQ' 'LN' 'DR' 'TER']


In [4]:
class Street:
    def street_type(x):
        pattern = re.compile(r'\sSTREET|\sSTR|\sSTE$|\sSRT$|\sSR$\s|\sSST$|\sSEET$|\sTREET$|\sSHEER$|\sSHEE$|\sSTREE$|\sSREET$|\sREET$|\sSTEE$|\sST$')
        x = re.sub(pattern, " ST", x)
        pattern = re.compile(r'\sDRIVE$|\sDRV$|\sDRI$|\sDRIV$|\sDRIE$|\sD.$')
        x = re.sub(pattern, " DR", x)
        pattern = re.compile(r'\sCIRCLE$|\sCIRCL$|\sCICLE$|\sCIRC$|\sCIR$|\sCRL$|\sC.$')
        x = re.sub(pattern, " CIR", x)
        pattern = re.compile(r'\sAVENUE$|\sAVENU$|\sAVEN$|\sAVE$|\sAVN$\sAV$')
        x = re.sub(pattern, " AVE", x)
        pattern = re.compile(r"\sCOURT$|\sCT$|\sCRT$|\sCTR$|\sCOUR$|<=\sCOT$|\sCORT$")
        x = re.sub(pattern, " CT", x)
        pattern = re.compile(r"\sBOULEVARD$|\sBVLD|\sBL.$|\sB.$")
        x = re.sub(pattern, " BLVD", x)
        pattern = re.compile(r"\sROAD$|\sRD$|\sRAD$|\sROD$")
        x = re.sub(pattern, " RD", x)
        pattern = re.compile(r"\sALLEY$|\sALY$|\sALEY$|\sALL.$|\sA.$")
        x = re.sub(pattern, " ALY", x)
        pattern = re.compile(r"\sPLACE$|\sPL.$|\sP.$|\sPLAC$|\sPLCE$|\sPCE$")
        x = re.sub(pattern, " PL", x)
        pattern = re.compile(r"\sPK$|\sPRK$|\sPRAK$|\sPAK$")
        x = re.sub(pattern, " PARK", x)
        pattern = re.compile(r"\sPARKWAY$|\sPKWY$|\sPARKW$|\sPWY$|\sPKW$|\sPRKWY$|\sPKW$")
        x = re.sub(pattern, " PKWY", x)
        pattern = re.compile(r"\sAPPROA$|\sAPRCH$|\sAPPRCH$|\sAPPR$|\sAPR$")
        x = re.sub(pattern, " APPROACH", x)
        pattern = re.compile(r"\sTERRACE$|\sTERR$|\sTER$|\sTRCE$|\sTRC$|\sTR$")
        x = re.sub(pattern, " TER", x)
        pattern = re.compile(r"\sPLAZA$|\sPLZA$|\sPLZ$|\sPLAZ$|\sPZ$")
        x = re.sub(pattern, " PLZ", x)
        pattern = re.compile(r"\sLANE$|\sLNE$|\sLN$|\sLAN$")
        x = re.sub(pattern, " LN", x)
        pattern = re.compile(r"\sBRIDGE$|\sBRGD$|\sBRG$|\sBGE$")
        x = re.sub(pattern, " BRG",x)
        pattern = re.compile(r"\sHILL$|\sHLL$|\sHL$|\sHIL$")
        x = re.sub(pattern, " HL", x)
        pattern = re.compile(r"\sHEIGHTS$|\sHTS$|\sHT$|\sHEGHTS$|\sHEIGHT$|\sHHT$|\sHEIGT$") 
        x = re.sub(pattern, " HTS", x)
        pattern = re.compile(r"\sSLP$|\sSLEP$|\sSLIIP$|\sSLI$")
        x = re.sub(pattern, " SLIP", x)
        pattern = re.compile(r"\sROOW$|\sRO.$|\sRW$")
        x = re.sub(pattern, " ROW", x)
        pattern = re.compile(r"\sSQUARE$") 
        x = re.sub(pattern, " SQ", x)
        
        return x
    
    def direction(x):
         # The orientations in the addresses
        pattern = re.compile(r'\sN\s')
        re.sub(pattern, " N ", x)
        pattern = re.compile(r'\sS\s')
        re.sub(pattern, " S ", x)
        pattern = re.compile(r'\sE\s')
        re.sub(pattern, " E ", x)
        pattern = re.compile(r'\sW\s')
        re.sub(pattern, " W ", x)
        
        return x

    def street_name_clean(x):
        pattern = re.compile(r'^\bST\b')
        x = re.sub(pattern, "SAINT", x)
        pattern = re.compile(r'\sHOUSE|\sHOSTEL|\sHOTEL|\sLODGE|\sLODGING')
        x = re.sub(pattern,"", x)
        
        pattern = re.compile(r'(\s|^)ALLANTIC|(\s|^)ATLASTA')
        x = re.sub(pattern, "ATLANTIC", x)
        pattern = re.compile(r'(\s|^)ALLEM')
        x = re.sub(pattern, "ALLEN", x)    
   
        pattern = re.compile(r'(\s|^)CROTON|(\s|^)GROTON')
        x = re.sub(pattern, "AUDUBON", x)    
        pattern = re.compile(r'(\s|^)(AT RINS)')
        x = re.sub(pattern, "ATKINS", x)     

        pattern = re.compile(r'(\s|^)BATTIE')
        x = re.sub(pattern, "BALTIC", x)      
        pattern = re.compile(r'(\s|^)BARREE')
        x = re.sub(pattern, "BARROW", x)   
        pattern = re.compile(r'(\s|^)BESSHLEY')
        x = re.sub(pattern, "BURLING", x)       
        pattern = re.compile(r'(\s|^)BIRY|(\s|^)(B WAY)|(\s|^)BWAY|(\s|^)BRAKSWAY')
        x = re.sub(pattern, "BROADWAY", x)         
        pattern = re.compile(r'(\s|^)BUSTWICK')
        x = re.sub(pattern, "BUSHWICK", x) 
        pattern = re.compile(r'(\s|^)BUTTER')
        x = re.sub(pattern, "BUTLER", x) 
        pattern = re.compile(r'(\s|^)BREEVORT')
        x = re.sub(pattern, "BREVOORT", x) 
        pattern = re.compile(r'(\s|^)BRENNEL|\sBROOMES\s|\sBROOM|(\s|^)BRANNAS|(\s|^)BROWN')
        x = re.sub(pattern, "BROOME", x) 
        pattern = re.compile(r'(\s|^)BLACKER|(\s|^)BLENKER')
        x = re.sub(pattern, "BLEECKER", x) 

        pattern = re.compile(r'(\s|^)CLAIR')
        x = re.sub(pattern, "CLASSON", x) 
        pattern = re.compile(r'(\s|^)CLISTEN')
        x = re.sub(pattern, "CLINTON", x)     
        pattern = re.compile(r'(\s|^)CHERY')
        x = re.sub(pattern, "CHERRY", x) 
        pattern = re.compile(r'(\s|^)CHRYSTEE|(\s|^)CHRYSTAL|(\s|^)CHTYSTIE|(\s|^)CHRYSTEL')
        x = re.sub(pattern, "CHRYSTIE", x)     
        pattern = re.compile(r'(\s|^)CARRAL|(\s|^)COYAL')
        x = re.sub(pattern, "CANAL", x)    
        pattern = re.compile(r'(\s|^)COLUMBIN')
        x = re.sub(pattern, "COLUMBIA", x) 
        pattern = re.compile(r'(\s|^)CAMNON')
        x = re.sub(pattern, "CANNON", x) 
        pattern = re.compile(r'(\s|^)CROWH')
        x = re.sub(pattern, "CROWN", x) 

        pattern = re.compile(r'(\s|^)DEVAL')
        x = re.sub(pattern, "DEVOE", x)
        pattern = re.compile(r'(\s|^)DEBOUCHEL')
        x = re.sub(pattern, "DEBEVOISE",x)
        pattern = re.compile(r'(\s|^)DAFONT')
        x = re.sub(pattern, "DUPONT", x)
        pattern = re.compile(r'(\s|^)DEGRAN')
        x = re.sub(pattern, "DEGRAW", x)
        pattern = re.compile(r'(\s|^)DENBO|(\s|^)DEKALB')
        x = re.sub(pattern, "DE KALB", x)
        pattern = re.compile(r'(\s|^)DELAMERE|(\s|^)DALANEY|(\s|^)DELANEY|(\s|^)DELANCY')
        x = re.sub(pattern, "DELANCEY", x) 

        pattern = re.compile(r'(\s|^)ELTHZROTH|(\s|^)ELLSWICK')
        x = re.sub(pattern, "ELLIOTT", x)
        pattern = re.compile(r'(\s|^)ELDREDGE|(\s|^)CLARIDGE')
        x = re.sub(pattern, "ELDRIDGE", x) 
        pattern = re.compile(r'(\s|^)ESSEY')
        x = re.sub(pattern, "ESSEX", x) 

        pattern = re.compile(r'(\s|^)FORSYTHE')
        x = re.sub(pattern, "FORSYTH", x) 
        pattern = re.compile(r'(\s|^)FLATHISH')
        x = re.sub(pattern, "FLATBUSH", x)

        pattern = re.compile(r'(\s|^)GLANCE')
        x = re.sub(pattern, "GRAND", x) 
        pattern = re.compile(r'(\s|^)GOAST')
        x = re.sub(pattern, "GOERCK", x)
        pattern = re.compile(r'(\s|^)GREENS')
        x = re.sub(pattern, "GREENE", x)
        pattern = re.compile(r'(\s|^)GREENRICH|(\s|^)GAMWICH')
        x = re.sub(pattern, "GREENWICH", x) 

        pattern = re.compile(r'(\s|^)HOUTON')
        x = re.sub(pattern,  "HOUSTON", x) 
        pattern = re.compile(r'(\s|^)HAVES')
        x = re.sub(pattern, "HEWES", x) 
        pattern = re.compile(r'(\s|^)HAKEY')
        x = re.sub(pattern, "HALSEY", x)
        pattern = re.compile(r'(\s|^)HEWEY')
        x = re.sub(pattern, "HENRY", x)
        pattern = re.compile(r'(\s|^)HICK')
        x = re.sub(pattern, "HICKS", x)
        pattern = re.compile(r'(\s|^)HUMBOLOT|(\s|^)HUMBARD|(\s|^)HUMBOLT')
        x = re.sub(pattern, "HUMBOLDT", x) 

        pattern = re.compile(r'(\s|^)JOHOM')
        x = re.sub(pattern, "JOHNS", x) 

        pattern = re.compile(r'(\s|^)KIOP|(\s|^)HEAP')
        x = re.sub(pattern, "KEAP", x) 

        pattern = re.compile(r'(\s|^)(LAY FAY ESTE)|(\s|^)LADORATT|(\s|^)LAFYAYETTE')
        x = re.sub(pattern, "LAFAYETTE", x)
        pattern = re.compile(r'(\s|^)LIRA|(\s|^)LOUMOR|(\s|^)LARMER')
        x = re.sub(pattern, "LORIMER", x)
        pattern = re.compile(r'(\s|^)LAAVIUK')
        x = re.sub(pattern, "LAWRENCE", x) 
        pattern = re.compile(r'(\s|^)LAIDLOW')
        x = re.sub(pattern, "LUDLOW", x) 
        pattern = re.compile(r'(\s|^)TEX|LEX(?=\s)') # perl = True
        x = re.sub(pattern, "LEXINGTON", x)   
        pattern = re.compile(r'(\s|^)REPPERTS')
        x = re.sub(pattern, "LEFFERTS", x)

        pattern = re.compile(r'(\s|^)PARLE|(\s|^)MALLE|(\s|^)MYETTE')
        x = re.sub(pattern, "MYRTLE", x)
        pattern = re.compile(r'(\s|^)(MC DOUGALL)|(\s|^)(MC DOUGAL)|(\s|^)(MCDOUGALL)')
        x = re.sub(pattern, "MCDOUGAL", x)
        pattern = re.compile(r'(\s|^)(MC DONOUGH)')
        x = re.sub(pattern, "MCDONOUGH", x)
        pattern = re.compile(r'(\s|^)MANZA|(\s|^)MAREY')
        x = re.sub(pattern, "MARCY", x)
        pattern = re.compile(r'(\s|^)MESCOLE')
        x = re.sub(pattern, "MESEROLE", x)
        pattern = re.compile(r'(\s|^)MEASE')
        x = re.sub(pattern, "MOORE", x)
        pattern = re.compile(r'(\s|^)MEDDLER')
        x = re.sub(pattern, "MIDDLETON", x)
        pattern = re.compile(r'(\s|^)MANGEN')
        x = re.sub(pattern, "MANGIN", x)
        pattern = re.compile(r'(\s|^)HAULL|(\s|^)MALLERY')
        x = re.sub(pattern, "MULBERRY", x)

        pattern = re.compile(r'(\s|^)NAPOLK')
        x = re.sub(pattern, "NORFOLK", x)
        pattern = re.compile(r'(\s|^)(VAST AND)')
        x = re.sub(pattern, "NOSTRAND", x)

        pattern = re.compile(r'(\s|^)DAK')
        x = re.sub(pattern, "OAK", x)
        pattern = re.compile(r'(\s|^)OLWEN')
        x = re.sub(pattern, "OLIVER", x)
        pattern = re.compile(r'(\s|^)GERHARD')
        x = re.sub(pattern, "ORCHARD", x)

        pattern = re.compile(r'(\s|^)PUTT')
        x = re.sub(pattern, "PITT", x)
        pattern = re.compile(r'(\s|^)PERROTT|(\s|^)(PERROTT PREMPONT)')
        x = re.sub(pattern, "PIERREPONT", x)
        pattern = re.compile(r'(\s|^)PLAD')
        x = re.sub(pattern, "PLACE", x)
        pattern = re.compile(r'(\s|^)PRUFER')
        x = re.sub(pattern, "PROSPECT", x)
        pattern = re.compile(r'(\s|^)PREDIDUNT')
        x = re.sub(pattern, "PRESIDENT", x)
        pattern = re.compile(r'(\s|^)PALOKA')
        x = re.sub(pattern, "PULASKI", x)

        pattern = re.compile(r'(\s|^)RUTHIE')
        x = re.sub(pattern, "RUTLEDGE", x)
        pattern = re.compile(r'(\s|^)RIDAL')
        x = re.sub(pattern, "RIDGE", x)
        pattern = re.compile(r'(\s|^)RAYSON')
        x = re.sub(pattern, "RYERSON", x)
        pattern = re.compile(r'(\s|^)REVENTON')
        x = re.sub(pattern, "RIVINGTON", x)
        pattern = re.compile(r'(\s|^)RUALMAINE|(\s|^)(RICER SIDE)')
        x = re.sub(pattern, "RIVERSIDE", x)
        pattern = re.compile(r'(\s|^)REDERICK|(\s|^)RENNICK')
        x = re.sub(pattern, "RENWICK", x)

        pattern = re.compile(r'(\s|^)SELLTOWN')
        x = re.sub(pattern, "SULLIVAN", x)
        pattern = re.compile(r'(\s|^)SISH')
        x = re.sub(pattern, "SIDE", x)
        pattern = re.compile(r'(\s|^)STUCKER')
        x = re.sub(pattern, "STEUBEN", x)
        pattern = re.compile(r'(\s|^)STATES')
        x = re.sub(pattern, "STATE", x)
        pattern = re.compile(r'(\s|^)SCHAALS')
        x = re.sub(pattern, "SCHOLES", x)
        pattern = re.compile(r'(\s|^)SUMME')
        x = re.sub(pattern, "SUMMIT", x)
        pattern = re.compile(r'(\s|^)SCHOMERDOSA')
        x = re.sub(pattern, "SCHERMERHORN", x)
        pattern = re.compile(r'(\s|^)DOUTH|(\s|^)SONSE')
        x = re.sub(pattern, "SOUTH", x)
        pattern = re.compile(r'(\s|^)STUYVESTANT')
        x = re.sub(pattern, "STUYVESANT", x)

        pattern = re.compile(r'(\s|^)STONPSON')
        x = re.sub(pattern, "THOMPSON", x)
        pattern = re.compile(r'(\s|^)TRAY')
        x = re.sub(pattern, "TROY", x)
        pattern = re.compile(r'(\s|^)TAYLER')
        x = re.sub(pattern, "TAYLOR", x)

        pattern = re.compile(r'(\s|^)WMON')
        x = re.sub(pattern, "UNION", x)

        pattern = re.compile(r'(\s|^)(WAR CAREN)')
        x = re.sub(pattern, "VAN BUREN", x)
        pattern = re.compile(r'(\s|^)VEMON')
        x = re.sub(pattern, "VERNON", x)
        pattern = re.compile(r'(\s|^)VANDERLY|(\s|^)VANDERSLIDE')
        x = re.sub(pattern, "VANDERBILT", x)

        pattern = re.compile(r'(\s|^)WYONIA')
        x = re.sub(pattern, "WYONA", x)
        pattern = re.compile(r'(\s|^)WITKINS')
        x = re.sub(pattern, "WATKINS", x)
        pattern = re.compile(r'(\s|^)WALLWORTH')
        x = re.sub(pattern, "WALWORTH", x)
        pattern = re.compile(r'(\s|^)WHIPPER')
        x = re.sub(pattern, "WHIPPLE", x)
        pattern = re.compile(r'(\s|^)WALLABANK|(\s|^)WALKABOUT')
        x = re.sub(pattern, "WALLABOUT", x)
        pattern = re.compile(r'\sWASH\s|(\s|^)WASTEWATER\s')
        x = re.sub(pattern, "WASHINGTON", x) 
        
        pattern = re.compile(r'\s&\s')
        x = re.sub(pattern, "", x)
    #     pattern = re.compile(r"(\d)ST|ND|RD|TH\b")
    #     x = re.sub(pattern, "", x)

        return x
    
    def space_problem(x):
        # overall spacing problem extra white space
        pattern = re.compile(r'((\w+|\d+)\s\s(\w+|\d+))|((\w+|\d+)\s\s\s(\w+|\d+))')
        x = re.sub(pattern, r"\1 \2", x)

        # change the order of the address, e.g: 4 W to W 4 & AVE after A-Z and Digits
        pattern = re.compile(r'(\w+|\d+)\s(\bN\b|\bW\b|\bS\b|\bE\b)')
        x = re.sub(pattern, r'\2 \1', x)
        pattern = re.compile(r'(AVE)\s(\d+)\s(\d+)')
        x = re.sub(pattern, r'\2-\3 \1', x) 
        pattern = re.compile(r'(AVE)\s([A-Z])')
        x = re.sub(pattern, r'\2 \1', x) 
    #     pattern = re.compile(r'(AVE)\s(\b[A-Z])')
    #     x = re.sub(pattern, r'\2 \1', x) 
    
        return x
    
    def seperate(df, column):
        
        df['original'] = df[column]
        df[column] = df[column].apply(lambda x: str(x.upper()))
        df[column] = df[column].apply(lambda x: Street.name_end(str(x)))
        df[column] = df[column].apply(lambda x: Street.direction(str(x)))
        df[column] = df[column].apply(lambda x: Street.special_characters(str(x)))
        df[column] = df[column].apply(lambda x: Street.number_clean(str(x)))
        df[column] = df[column].apply(lambda x: Street.street_order(str(x)))
        df[column] = df[column].apply(lambda x: Street.name_check(str(x)))
        df[column] = df[column].apply(lambda x: Street.space_problem(str(x)))
        df['new'] = df[column]
        df.drop(column, axis=1, inplace = True)
    
        return df
    


In [5]:
pattern = {
    'direction' : r'WEST|EAST|NORTH|SOUTH|\sW\s|\sE\s|\sN\s|\sS\s',
    'end' : r'\sSTREET$|\bSTREET$|\sSTR$|\sSTE$|\sSRT$|\sSR$\s|SRT$|\sSST$|\sSEET$|\sTREET$|\sSHEER$|\sSHEE$|\sSTREE$|\sSREET$|\sREET$|\sSTEE$|\sST$',
    'street_type': r'\sDRIVE$|\sDR$|\sDV$|\sDE$|\sDRV$|\sDRI$|\sDRIV$|\sDRIE$|\sCIRCLE|\sCIR|\sCRL|\sCIRC|\sCR|\sCL|\sCIRCL|\sCICLE|(?<=\s)COURT|CT|CRT|CRT|CTR|(?<=\s)COUR|(?<=\s)COT|(?<=\s)CORT|\sBOULEVARD|\sBVLD|\sBV|\sBLD$|\sBD|\sBL$|\sBLV$|\sROAD|\sRD|\sRAD|\sROD|\sALLEY|\sALY|\sAL|\sALLY|\sALEY|\sALLE|\sAY|\sPLACE|\sPL|\sPLC|\sPLE|\sPC|\sPLAC|\sPLCE|\sPCE|\sPK|\sPRK|\sPRAK|\sPAK|\sPARKWAY|\sPKWY|\sPARKW|\sPWY|\sPKW|\sPRKWY|\sPKWY|\sPKW|\sAPPR|\sAPR|\sAPPROA|\sAPRCH|\sAPPRCH|\sTERRACE|\sTER|\sTERR|\sTRC|\sTRCE|\sTR|\sPLAZA|\sPLZ|\sPLAZ|\sPZ|\sPLZA|\sLANE|\sLN|\sLNE|\sLAN|\sHILL|\sHL|\sHLL|\sHIL|\sHEIGHTS|\sHTS|\sHT|\sHEIGHT|\sHEGHTS|\sHHT|\sHEIGT|\sSLP|\sSLEP|\sSLIIP|\sSLI|\sRON|\sRW|\sROE|\sROOW|\sSQUARE',
#     'number': r'FRIST|FRST|FIRST|ONE|ONE HUNDRED|ONEHUNDRED|HUNDRED|HUDRED|HUNDED|TWO HUNDRED|TWOHUNDRED|TWENTY|TWENTI|TENTI|SECOND|SECORD|SCOND|TWO|THIRTY|THIRTHY|THIRTEY|TIRTY|TRITHY|THRID|THIRD|TIRD|TRIH|THREE|FORTY|FORTH|FOURTHY|FOURT|FRTY|FROTH|FROUTH|FOUR|FIFTY|FIFTHE|FIFTHY|FIFTH|FIFTEY|FIFT|FITY|FIFETH|FIFFTH|FIVE|SIXTY|SXTY|SIXY|SIXTHY|SIXTEY|SIXTH|SXTH|SITH|SIHXT|SIX|SEVENT|SEVENTY|SEVENTEY|SVENTY|SEVENTI|SEVENTH|SVEN|SVENTH|SEVENH|SEVEN|EIGHTY|EIGHTEY|EIGHTE|EIGHTH|EIGHTEH|EITH|EIGHT|EIGHTTH|EIGTH|FIGHT|UNITY|NINTH|NINETY|NINETEY|NINETIETH|NINTY|TENTH|ELEVENTH|TWELFTH|THIRTEENTH|FORTEENTH|FIFTEENTH|SIXTEENTH|SEVENTEENTH|EIGHTEENTH|NINETEENTH|TWENTIETH|TWENTIEFTH|THIRTIETH|THIRTIEFTH|FORTIETH|FOURTIETH|FOURTHENTH|FIFTIETH|SIXTIETH|SEVENTIETH|SEVENTY|EIGHTIETH|EIGHTETH|NINETIETH|NINTIETH',
    'number': r'FRIST|FRST|FIRST|ONE|ONE HUNDRED|ONEHUNDRED|HUNDRED|HUDRED|HUNDED|TWO HUNDRED|TWOHUNDRED|TWENTY|TWENTI|TENTI|SECOND|SECORD|SCOND|TWO|THIRTY|THIRTHY|THIRTEY|TIRTY|TRITHY|THRID|THIRD|TIRD|TRIH|THREE|FORTIETH|FOURTIETH|FORTEENTH|FOURTHENTH|FORTY|FORTH|FOURTHY|FOURT|FRTY|FROTH|FROUTH|FOUR|FIFTIETH|FIFTEENTH|FIFTY|FIFTHE|FIFTHY|FIFTH|FIFTEY|FIFT|FITY|FIFETH|FIFFTH|FIVE|SIXTIETH|SIXTEENTH|SIXTY|SXTY|SIXY|SIXTHY|SIXTEY|SIXTH|SXTH|SITH|SIHXT|SIX|SEVENTIETH|SEVENTY|SEVENTEENTH|SEVENTY|SEVENTEY|SVENTY|SEVENTI|SEVENTH|SVEN|SVENTH|SEVENH|SEVENT|SEVEN|EIGHTIETH|EIGHTETH|EIGHTEENTH|EIGHTY|EIGHTEH|EIGHTEY|EIGHTE|EIGHTH|EITH|EIGHT|EIGHTTH|EIGTH|FIGHT|UNITY|NINETIETH|NINTIETH|NINETEENTH|NINETY|NINETEY|NINETIETH|NINTH|NINTY|TENTH|ELEVENTH|TWELFTH|THIRTEENTH|TWENTIETH|TWENTIEFTH|THIRTIETH|THIRTIEFTH',
    'street_name': r'ALLANTIC|ATLASTA|ALLEM|ALLEN|CROTON|GROTON|AT RINS|BATTIE|BARREE|BESSHLEY|BIRY|B WAY|BWAY|BRAKSWAY|BUSTWICK|BUTTER|BREEVORT|BRENNEL|BROOMES|BROOM|BRANNAS|BROWN|BLACKER|BLENKER|CLAIR|CLISTEN|CHERY|CHRYSTEE|CHRYSTAL|CHTYSTIE|CHRYSTEL|CARRAL|COYAL|COLUMBIN|CAMNON|CROWH|DEVAL|DEBOUCHEL|DAFONT|DEGRAN|DENBO|DEKALB|DELAMERE|DALANEY|DELANEY|DELANCY|ELTHZROTH|ELLSWICK|ELDREDGE|CLARIDGE|ESSEY|FORSYTHE|FLATHISH|GLANCE|GOAST|GREENS|GREENRICH|GAMWICH|HOUTON|HAVES|HAKEY|HEWEY|HICK|HUMBOLOT|HUMBARD|HUMBOLT|JOHOM|KIOP|HEAP|LAY FAY ESTE|LADORATT|LAFYAYETTE|LIRA|LOUMOR|LARMER|LAAVIUK|LAIDLOW|TEX|LEX|REPPERTS|PARLE|MALLE|MYETTE|MC DOUGALL|MC DOUGAL|MCDOUGALL|MC DONOUGH|MANZA|MAREY|MESCOLE|MEASE|MEDDLER|MANGEN|HAULL|MALLERY|NAPOLK|VAST AND|DAK|OLWEN|GERHARD|PUTT|PERROTT|PERROTT PREMPONT|PLAD|PRUFER|PREDIDUNT|PALOKA|RUTHIE|RIDAL|RAYSON|REVENTON|RUALMAINE|RICER SIDE|REDERICK|RENNICK|SELLTOWN|SISH|STUCKER|STATES|SCHAALS|SUMME|SCHOMERDOSA|DOUTH|SONSE|STUYVESTANT|STONPSON|TRAY|TAYLER|WMON|WAR CAREN|VEMON|VANDERLY|VANDERSLIDE|WYONIA|WITKINS|WALLWORTH|WHIPPER|WALLABANK|WALKABOUT|WASH|WASTEWATER'
}


In [6]:
df['number'] = df['Street address 2'].str.extract('(\d+)')

df['number']

0        38
1       107
2        81
3       117
4        62
       ... 
1995    158
1996     78
1997      8
1998    NaN
1999    609
Name: number, Length: 2000, dtype: object

In [8]:
def search_direction(x):
    
    result = re.findall(pattern['direction'], x)
    return " ".join(result)

def search_end(x):
    result = re.findall(pattern['end'], x)
    return " ".join(result)

def search_street_type(x):
    result = re.findall(pattern['street_type'], x)
    return " ".join(result)

def search_word_number(x):
    result = re.findall(pattern['number'], x)
    return " ".join(result)

def search_street_name(x):
    result = re.findall(pattern['street_name'], x)
    return " ".join(result)

df['direction'] = df['Street address 2'].apply(lambda x : search_direction(x))
df['end'] = df['Street address 2'].apply(lambda x : search_end(x))
df['street_type'] = df['Street address 2'].apply(lambda x : search_street_type(x))
df['word_number'] = df['Street address 2'].apply(lambda x : search_word_number(x))
df['street_name'] = df['Street address 2'].apply(lambda x : search_street_name(x))



In [9]:
# m = 'WEST SIXTY FIFTH FORTH ST'

# pattern = r'FRIST|FRST|FIRST|ONE|ONE HUNDRED|ONEHUNDRED|HUNDRED|HUDRED|HUNDED|TWO HUNDRED|TWOHUNDRED|TWENTY|TWENTI|TENTI|SECOND|SECORD|SCOND|TWO|THIRTY|THIRTHY|THIRTEY|TIRTY|TRITHY|THRID|THIRD|TIRD|TRIH|THREE|FORTY|FORTH|FOURTHY|FOURT|FRTY|FROTH|FROUTH|FOUR|FIFTY|FIFTHE|FIFTHY|FIFTH|FIFTEY|FIFT|FITY|FIFETH|FIFFTH|FIVE|SIXTY|SXTY|SIXY|SIXTHY|SIXTEY|SIXTH|SXTH|SITH|SIHXT|SIX|SEVENT|SEVENTY|SEVENTEY|SVENTY|SEVENTI|SEVENTH|SVEN|SVENTH|SEVENH|SEVEN|EIGHTY|EIGHTEY|EIGHTE|EIGHTH|EIGHTEH|EITH|EIGHT|EIGHTTH|EIGTH|FIGHT|UNITY|NINTH|NINETY|NINETEY|NINETIETH|NINTY|TENTH|ELEVENTH|TWELFTH|THIRTEENTH|FORTEENTH|FIFTEENTH|SIXTEENTH|SEVENTEENTH|EIGHTEENTH|NINETEENTH|TWENTIETH|TWENTIEFTH|THIRTIETH|THIRTIEFTH|FORTIETH|FOURTIETH|FOURTHENTH|FIFTIETH|SIXTIETH|SEVENTIETH|SEVENTY|EIGHTIETH|EIGHTETH|NINETIETH|NINTIETH'
# # pattern = r'SIXTY|FIFTH|SEVENTH|FORT|FOURTHY'
# a = re.findall(pattern, m)
# a




In [10]:
print(df['direction'])
print(df['direction'].unique())

0           
1       EAST
2           
3       EAST
4       WEST
        ... 
1995    WEST
1996        
1997        
1998        
1999      E 
Name: direction, Length: 2000, dtype: object
['' 'EAST' 'WEST' 'SOUTH' ' E ' 'SOUTH WEST' 'WEST NORTH' 'SOUTH  S  WEST'
 'EAST EAST' ' E   E ']


In [11]:
print(df['end'])
print(df['end'].unique())

0        STREET
1        STREET
2        STREET
3        STREET
4        STREET
         ...   
1995     STREET
1996         ST
1997           
1998           
1999         ST
Name: end, Length: 2000, dtype: object
[' STREET' '' ' ST' ' STR' ' STE' 'STREET' ' STREE' ' REET']


In [12]:
print(df['street_type'])
print(df['street_type'].unique())

0        
1        
2        
3        
4        
       ..
1995     
1996     
1997     
1998     
1999     
Name: street_type, Length: 2000, dtype: object
['' ' PLACE' ' SQUARE' ' ROAD' ' LANE' ' CL' ' DRIVE' ' AL' ' TERRACE']


In [13]:
print(df['word_number'])
print(df['word_number'].unique())

0        
1        
2        
3        
4        
       ..
1995     
1996     
1997     
1998     
1999     
Name: word_number, Length: 2000, dtype: object
['' 'FIFTH' 'SIXTY FIFTH' 'NINTY FIRST' 'THIRD' 'SIXTY SECOND' 'NINTH'
 'SECOND' 'EIGHTH' 'SEVENTH' 'FIFTY FOURT' 'SIXTH' 'FIFTY FIFTH'
 'THIRTY THIRD' 'SIXTIETH' 'FIRST' 'ONE' 'TWENTY EIGHT' 'TENTH'
 'ONE HUNDRED SEVEN' 'FOURT' 'ONE TWENTY NINTH' 'FOURT NINTH' 'THIRTEENTH'
 'TWENTY THIRD' 'FIFTY THIRD' 'SIXTY THIRD' 'NINETY THIRD' 'SEVENTY THIRD'
 'TWENTY EIGHTH' 'ONE HUNDRED SEVENTEENTH' 'ONE HUNDRED THIRTY'
 'EIGHTY EIGHTH' 'THIRTY EIGHT' 'TWELFTH' 'EIGHTY NINTH' 'NINTY NINTH'
 'THIRTY FIRST' 'FIFTY SECOND' 'SEVENTIETH' 'THIRTY SEVENTH' 'FORTIETH'
 'FORTY FIRST' 'ONE HUNDRED FOURT' 'EIGTH' 'NINTY SIXTH'
 'ONE HUNDRED FIFTH' 'FIFTY SEVENTH' 'EIGHT' 'ELEVENTH' 'FIFTY FIRST'
 'ONE HUNDRED TWENTY FIFT' 'SEVENTY FOURT' 'FOURT FIFTH' 'SEVENTEENTH'
 'UNITY FIFTH' 'TWENTY SIXTH' 'ONE HUNDRED TWENTI' 'NINETY SIXTH'
 'ONE HUNDRED EIGHT' '

In [14]:
print(df['street_name'])
print(df['street_name'].unique())

0          
1          
2          
3          
4          
       ... 
1995       
1996       
1997       
1998    LEX
1999       
Name: street_name, Length: 2000, dtype: object
['' 'CHRYSTAL' 'LEX' 'WASH' 'BROOM' 'ALLEN' 'FORSYTHE' 'RENNICK' 'CHERY'
 'HOUTON' 'BLACKER']


In [15]:
df

Unnamed: 0,Record type,Standardized township (string),County 2,Enumeration district 2,Consistent historical data person identifier,Dwelling sequence number,Dwelling serial number,Dwelling serial number 2,Household sequence within dwelling,"Household sequence within dwelling, 8 digit",...,Number of families in household,"Number of person records in household, before large group quarters were split up (100% datasets)",House number,Street address 2,number,direction,end,street_type,word_number,street_name
0,H,MANHATTAN,610,1256,,,60496,661,38,60496,...,2,7,,38 STREET,38,,STREET,,,
1,H,MANHATTAN,610,339,,,1943060,21190,18,1943060,...,1,2,,EAST 107TH STREET,107,EAST,STREET,,,
2,H,MANHATTAN,610,1043,,,477699,5196,50,477699,...,1,5,337,81ST STREET,81,,STREET,,,
3,H,MANHATTAN,610,292,,,217548,2408,92,217548,...,2,4,,EAST 117TH STREET,117,EAST,STREET,,,
4,H,MANHATTAN,610,1318,,,739265,8103,25,739265,...,1,1,,WEST 62ND STREET,62,WEST,STREET,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,H,MANHATTAN,610,691,,,53635,585,9,53635,...,1,5,,WEST 158 STREET,158,WEST,STREET,,,
1996,H,MANHATTAN,610,1036,,,474555,5164,39,474555,...,1,3,,E 78TH ST,78,,ST,,,
1997,H,MANHATTAN,610,1202,,,511291,5578,53,511291,...,1,2,,8TH AVE,8,,,,,
1998,H,MANHATTAN,610,425,,,439107,4767,16,439107,...,1,4,,LEXINGTON AVE,,,,,,LEX


In [23]:
def search_direction(text):
    result = re.findall(r'WEST|EAST|NORTH|SOUTH|\sW\s|\sE\s|\sN\s|\sS\s', text)
    return " ".join(result)




In [7]:
x = 'WEST ONE HUNDRED AND SEVEN STREET'
a = re.findall('WEST', x)[0]
a

'WEST'

regular expression in R:   
- https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html
- https://cran.r-project.org/web/packages/stringr/vignettes/regular-expressions.html
 
regular expression in Python:  
- https://www.bogotobogo.com/python/python_regularExpressions.php#:~:text=Well%2C%20%5CD%20matches%20any%20character,try%20to%20match%20different%20separators.
 
Quite useful regular expression extract web:  
- https://regex101.com/r/rzNgTL/1