In [1]:
# Import the required packages
import pandas as pd
import numpy as np
import regex as re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the census extract into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('census_1910_mn_small.csv')

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Record type', 'Standardized township (string)', 'County 2',
       'Enumeration district 2',
       'Consistent historical data person identifier',
       'Dwelling sequence number', 'Dwelling serial number',
       'Dwelling serial number 2', 'Household sequence within dwelling',
       'Household sequence within dwelling, 8 digit',
       'Household serial number 2',
       'Household serial number, before large group quarters were split up (100% datasets)',
       'Individual sequence number',
       'Large group quarters that was split up (100% datasets)', 'Line number',
       'Line number 2', 'Microfilm page number',
       'Number of families in household',
       'Number of person records in household, before large group quarters were split up  (100% datasets)',
       'House number', 'Street address 2'],
      dtype='object')

In [4]:
# Compiled once when the cell runs instead of on every call.
_SPLITER_DIGIT_RUN = re.compile(r'\d+')
_SPLITER_LEADING_NUMBER = re.compile(r'^\d*\s')


def spliter(x):
    """Strip the leading number from an address that holds several numbers.

    The address is left untouched when it contains exactly one run of digits
    (e.g. ``"38 STREET"``). Otherwise a run of digits at the very start of the
    string, together with the single whitespace character that follows it, is
    removed (e.g. ``"123 EAST 107TH STREET"`` -> ``"EAST 107TH STREET"``).
    Presumably that leading number is the house number -- confirm against the
    data dictionary.

    Parameters
    ----------
    x : str or missing value
        Raw street address. Non-string values (NaN, None) are returned
        unchanged so the function can be applied to a column that has
        missing entries instead of raising ``TypeError``.

    Returns
    -------
    The cleaned address, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x

    # Exactly one number: treat it as part of the street name and keep it.
    if len(_SPLITER_DIGIT_RUN.findall(x)) == 1:
        return x

    # The pattern is anchored at the start, so at most one prefix is removed;
    # addresses that do not start with "<digits><whitespace>" pass through.
    return _SPLITER_LEADING_NUMBER.sub("", x)

# Overwrite the address column with the result of spliter for every row.
df['Street address 2'] = df['Street address 2'].apply(spliter)

In [5]:
# Display the 'Street address 2' column.
df['Street address 2']

0               38 STREET
1       EAST 107TH STREET
2             81ST STREET
3       EAST 117TH STREET
4        WEST 62ND STREET
              ...        
1995      WEST 158 STREET
1996            E 78TH ST
1997              8TH AVE
1998        LEXINGTON AVE
1999            E 14TH ST
Name: Street address 2, Length: 2000, dtype: object

In [6]:
# (compiled pattern, replacement) pairs, three per compass direction and in
# the order they are applied: middle of the address, end, start.
_DIRECTION_RULES = [
    rule
    for word, abbr in (("NORTH", "N"), ("SOUTH", "S"), ("EAST", "E"), ("WEST", "W"))
    for rule in (
        (re.compile(r'\s' + abbr + r'\s|\s' + word + r'\s'), " " + abbr + " "),
        (re.compile(r'\s' + word + r'$'), " " + abbr),
        (re.compile(r'^' + word + r'\s'), abbr + " "),
    )
]


def street_direction(x):
    """Abbreviate compass directions in an address to N, S, E or W.

    For each of NORTH, SOUTH, EAST and WEST three rewrites are applied:

    * in the middle of the address (whitespace on both sides) the word, or an
      existing one-letter abbreviation, becomes ``" N "`` etc. with single
      spaces around it;
    * at the end of the address (``"CENTRAL PARK WEST"``) the word becomes the
      abbreviation;
    * at the start of the address (``"EAST 107TH STREET"``) the word becomes
      the abbreviation.

    The end-of-address rule for EAST and WEST previously used ``^EAST$`` /
    ``^WEST$``. That never matched a trailing direction and turned an address
    consisting solely of ``"EAST"`` into ``" E"``. All four directions now use
    the same ``\\s<WORD>$`` rule.

    Parameters
    ----------
    x : str or missing value
        Address text. The patterns are upper-case only. Non-string values
        (NaN, None) are returned unchanged.

    Returns
    -------
    The address with directions abbreviated, or ``x`` itself when it is not
    a string.
    """
    if not isinstance(x, str):
        return x

    for pattern, replacement in _DIRECTION_RULES:
        x = pattern.sub(replacement, x)
    return x

# New column: the address with street_direction applied to every row.
df['street_direction'] = df['Street address 2'].apply(street_direction)
       
    

In [7]:
# Display the 'street_direction' column.
df['street_direction']

0            38 STREET
1       E 107TH STREET
2          81ST STREET
3       E 117TH STREET
4        W 62ND STREET
             ...      
1995      W 158 STREET
1996         E 78TH ST
1997           8TH AVE
1998     LEXINGTON AVE
1999         E 14TH ST
Name: street_direction, Length: 2000, dtype: object

In [8]:
# Ordered (compiled pattern, canonical suffix) rules. The rules run one after
# another on the same string, so the order is significant.
_STREET_TYPE_RULES = [
    (re.compile(pattern), suffix)
    for pattern, suffix in (
        # STREET / STR must be whole words: an unanchored "\sSTR" used to win
        # over "\sSTREE$" ("MAIN STREE" -> "MAIN STEE") and chopped names that
        # merely start with STR ("STRAND" -> "STAND").
        (r'\sSTREET\b|\sSTR\b|\sSTE$|\sSRT$|\sSR$|\sSST$|\sSEET$|\sTREET$|\sSHEER$'
         r'|\sSHEE$|\sSTREE$|\sSREET$|\sREET$|\sSTEE$|\sST$', " ST"),
        (r'\sDRIVE$|\sDRV$|\sDRI$|\sDRIV$|\sDRIE$|\sD.$', " DR"),
        # The one-character wildcard must not swallow " CT", which the COURT
        # rule below lists explicitly.
        (r'\sCIRCLE$|\sCIRCL$|\sCICLE$|\sCIRC$|\sCIR$|\sCRL$|\sC(?!T).$', " CIR"),
        # A "|" was missing between "\sAVN$" and "\sAV$", so " AV" fell
        # through to the ALLEY wildcard and became " ALY".
        (r'\sAVENUE$|\sAVENUE|\sAVENU$|\sAVEN$|\sAVE$|\sAVN$|\sAV$', " AVE"),
        # "\sCOT$" previously carried a stray literal "<=" prefix.
        (r'\sCOURT$|\sCT$|\sCRT$|\sCTR$|\sCOUR$|\sCOT$|\sCORT$', " CT"),
        (r'\sBOULEVARD$|\sBVLD|\sBL.$|\sB.$', " BLVD"),
        (r'\sROAD$|\sRD$|\sRAD$|\sROD$', " RD"),
        (r'\sALLEY$|\sALY$|\sALEY$|\sALL.$|\sA.$', " ALY"),
        # The wildcards must not swallow " PLZ", " PK" or " PZ", which the
        # PLAZA and PARK rules below list explicitly.
        (r'\sPLACE$|\sPL(?!Z).$|\sP(?![KZ]).$|\sPLAC$|\sPLCE$|\sPCE$', " PL"),
        (r'\sPK$|\sPRK$|\sPRAK$|\sPAK$', " PARK"),
        (r'\sPARKWAY$|\sPKWY$|\sPARKW$|\sPWY$|\sPKW$|\sPRKWY$', " PKWY"),
        (r'\sAPPROA$|\sAPRCH$|\sAPPRCH$|\sAPPR$|\sAPR$', " APPROACH"),
        (r'\sTERRACE$|\sTERR$|\sTER$|\sTRCE$|\sTRC$|\sTR$', " TER"),
        (r'\sPLAZA$|\sPLZA$|\sPLZ$|\sPLAZ$|\sPZ$', " PLZ"),
        (r'\sLANE$|\sLNE$|\sLN$|\sLAN$', " LN"),
        (r'\sBRIDGE$|\sBRGD$|\sBRG$|\sBGE$', " BRG"),
        (r'\sHILL$|\sHLL$|\sHL$|\sHIL$', " HL"),
        (r'\sHEIGHTS$|\sHTS$|\sHT$|\sHEGHTS$|\sHEIGHT$|\sHHT$|\sHEIGT$', " HTS"),
        (r'\sSLP$|\sSLEP$|\sSLIIP$|\sSLI$', " SLIP"),
        (r'\sROOW$|\sRO.$|\sRW$', " ROW"),
        (r'\sSQUARE$', " SQ"),
    )
]


def street_type(x):
    """Normalise the street-type word of an address to a standard abbreviation.

    Spelled-out, abbreviated and misspelled street types (STREET, STREE, AVENUE,
    AV, COURT, ...) are rewritten to one canonical suffix each (ST, DR, CIR,
    AVE, CT, BLVD, RD, ALY, PL, PARK, PKWY, APPROACH, TER, PLZ, LN, BRG, HL,
    HTS, SLIP, ROW, SQ). Most alternatives only match at the end of the
    address; STREET, STR, AVENUE and BVLD also match earlier in the string.
    Patterns are upper-case only.

    Several rules keep a one-character wildcard fallback (``" D?"``, ``" C?"``,
    ``" B?"``, ``" A?"``, ``" P?"``, ``" PL?"``, ``" RO?"``). They are left as
    they were, except that they no longer capture suffixes another rule lists
    explicitly (see the comments on the rule table).

    Parameters
    ----------
    x : str or missing value
        Address text. Non-string values (NaN, None) are returned unchanged.

    Returns
    -------
    The address with its street type normalised, or ``x`` itself when it is
    not a string.
    """
    if not isinstance(x, str):
        return x

    for pattern, suffix in _STREET_TYPE_RULES:
        x = pattern.sub(suffix, x)
    return x

# New column: street_type applied to every direction-normalised address.
df['street_type'] = df['street_direction'].apply(street_type)
    

In [9]:
# Display the 'street_type' column.
df['street_type']

0               38 ST
1          E 107TH ST
2             81ST ST
3          E 117TH ST
4           W 62ND ST
            ...      
1995         W 158 ST
1996        E 78TH ST
1997          8TH AVE
1998    LEXINGTON AVE
1999        E 14TH ST
Name: street_type, Length: 2000, dtype: object

In [10]:
# Digits followed directly by an ordinal suffix, e.g. "107TH", "81ST".
_ORDINAL_SUFFIX = re.compile(r"(\d+)(ST|ND|RD|TH)")

# Replacement digits -> spellings (including observed misspellings) that map
# to them. Tens words map to a single digit (TWENTY -> "2") so that
# "TWENTY FIFTH" becomes "2 5"; that convention is inherited unchanged.
_NUMBER_WORDS = {
    "11": ("ELEVENTH",),
    "12": ("TWELFTH",),
    "13": ("THIRTEENTH",),
    "14": ("FOURTEENTH", "FORTEENTH", "FOURTHENTH"),
    "15": ("FIFTEENTH",),
    "16": ("SIXTEENTH",),
    "17": ("SEVENTEENTH",),
    "18": ("EIGHTEENTH",),
    "19": ("NINETEENTH",),
    "20": ("TWENTIETH", "TWENTIEFTH"),
    "30": ("THIRTIETH", "THIRTIEFTH"),
    "40": ("FORTIETH", "FOURTIETH"),
    "50": ("FIFTIETH",),
    "60": ("SIXTIETH",),
    "70": ("SEVENTIETH",),
    "80": ("EIGHTIETH", "EIGHTETH"),
    "90": ("NINETIETH", "NINTIETH"),
    "1": ("ONE HUNDRED", "ONEHUNDRED", "HUNDRED", "HUDRED", "HUNDED",
          "FRIST", "FRST", "FIRST", "ONE"),
    "2": ("TWO HUNDRED", "TWOHUNDRED", "TWENTY", "TWENTI", "TENTI",
          "SECOND", "SECORD", "SCOND", "TWO"),
    "3": ("THIRTY", "THIRTHY", "THIRTEY", "TIRTY", "TRITHY",
          "THRID", "THIRD", "TIRD", "TRIH", "THREE"),
    "4": ("FORTY", "FORTH", "FOURTH", "FOURTHY", "FOURT", "FRTY",
          "FROTH", "FROUTH", "FOUR"),
    "5": ("FIFTY", "FIFTHE", "FIFTHY", "FIFTH", "FIFTEY", "FIFT",
          "FITY", "FIFETH", "FIFFTH", "FIVE"),
    "6": ("SIXTY", "SXTY", "SIXY", "SIXTHY", "SIXTEY", "SIXTH",
          "SXTH", "SITH", "SIHXT", "SIX"),
    "7": ("SEVENTEY", "SVENTY", "SEVENTI", "SEVENTH", "SEVENTY", "SVEN",
          "SVENTH", "SEVENH", "SEVENT", "SEVEN"),
    "8": ("EIGHTY", "EIGHTEH", "EIGHTEY", "EIGHTE", "EIGHTH", "EITH",
          "EIGHT", "EIGHTTH", "EIGTH", "FIGHT"),
    "9": ("UNITY", "NINETY", "NINETEY", "NINTH", "NINTY", "NINE"),
    "10": ("TENTH",),
}

_NUMBER_WORD_TO_DIGITS = {
    word: digits for digits, words in _NUMBER_WORDS.items() for word in words
}

# One alternation over every spelling, longest first so that multi-word forms
# ("ONE HUNDRED") win over their prefixes ("ONE"). A match must start at the
# beginning of the string or after whitespace and end at a word boundary.
_NUMBER_WORD = re.compile(
    r"(?<!\S)(?:%s)\b"
    % "|".join(
        re.escape(word)
        for word in sorted(_NUMBER_WORD_TO_DIGITS, key=len, reverse=True)
    )
)


def street_number_name(x):
    """Turn ordinal and spelled-out street numbers into plain digits.

    Two steps are applied:

    1. an ordinal suffix directly after digits is dropped
       (``"107TH"`` -> ``"107"``);
    2. every number word listed in ``_NUMBER_WORDS`` is replaced by its digits
       (``"E FOURTEENTH ST"`` -> ``"E 14 ST"``,
       ``"E ONE HUNDRED TWENTY FIFTH ST"`` -> ``"E 1 2 5 ST"``).

    Differences from the earlier rule-by-rule version:

    * the correct spellings FOURTEENTH and FOURTH and the cardinal NINE are
      now recognised;
    * ``"ONE HUNDRED"`` and ``"TWO HUNDRED"`` are replaced as one unit (they
      used to yield ``"1 1"`` and ``"2 1"`` because ONE / HUNDRED matched
      first);
    * a number word at the very start of the address (``"FIFTH AVE"``) is
      converted too, not only one preceded by whitespace.

    Parameters
    ----------
    x : str or missing value
        Address text. Patterns are upper-case only. Non-string values
        (NaN, None) are returned unchanged.

    Returns
    -------
    The address with street numbers as digits, or ``x`` itself when it is not
    a string.
    """
    if not isinstance(x, str):
        return x

    x = _ORDINAL_SUFFIX.sub(r"\1", x)
    return _NUMBER_WORD.sub(lambda m: _NUMBER_WORD_TO_DIGITS[m.group(0)], x)

# New column: street_number_name applied to every type-normalised address.
df['street_number_name'] = df['street_type'].apply(street_number_name)


In [11]:
# Display the 'street_number_name' column.
df['street_number_name']

0               38 ST
1            E 107 ST
2               81 ST
3            E 117 ST
4             W 62 ST
            ...      
1995         W 158 ST
1996          E 78 ST
1997            8 AVE
1998    LEXINGTON AVE
1999          E 14 ST
Name: street_number_name, Length: 2000, dtype: object

In [12]:
# Canonical street name -> misread / misspelled variants that are rewritten
# to it. Variants containing a space are matched as that exact phrase.
_STREET_NAME_VARIANTS = {
    "ATLANTIC": ("ALLANTIC", "ATLASTA"),
    "ALLEN": ("ALLEM",),
    "AUDUBON": ("CROTON", "GROTON"),
    "ATKINS": ("AT RINS",),

    "BALTIC": ("BATTIE",),
    "BARROW": ("BARREE",),
    "BURLING": ("BESSHLEY",),
    "BROADWAY": ("BIRY", "B WAY", "BWAY", "BRAKSWAY"),
    "BUSHWICK": ("BUSTWICK",),
    "BUTLER": ("BUTTER",),
    "BREVOORT": ("BREEVORT",),
    "BROOME": ("BRENNEL", "BROOMES", "BROOM", "BRANNAS", "BROWN"),
    "BLEECKER": ("BLACKER", "BLENKER"),

    "CLASSON": ("CLAIR",),
    "CLINTON": ("CLISTEN",),
    "CHERRY": ("CHERY",),
    "CHRYSTIE": ("CHRYSTEE", "CHRYSTAL", "CHTYSTIE", "CHRYSTEL"),
    "CANAL": ("CARRAL", "COYAL"),
    "COLUMBIA": ("COLUMBIN",),
    "CANNON": ("CAMNON",),
    "CROWN": ("CROWH",),

    "DEVOE": ("DEVAL",),
    "DEBEVOISE": ("DEBOUCHEL",),
    "DUPONT": ("DAFONT",),
    "DEGRAW": ("DEGRAN",),
    "DE KALB": ("DENBO", "DEKALB"),
    "DELANCEY": ("DELAMERE", "DALANEY", "DELANEY", "DELANCY"),

    "ELLIOTT": ("ELTHZROTH", "ELLSWICK"),
    "ELDRIDGE": ("ELDREDGE", "CLARIDGE"),
    "ESSEX": ("ESSEY",),

    "FORSYTH": ("FORSYTHE",),
    "FLATBUSH": ("FLATHISH",),

    "GRAND": ("GLANCE",),
    "GOERCK": ("GOAST",),
    "GREENE": ("GREENS",),
    "GREENWICH": ("GREENRICH", "GAMWICH"),

    "HOUSTON": ("HOUTON",),
    "HEWES": ("HAVES",),
    "HALSEY": ("HAKEY",),
    "HENRY": ("HEWEY",),
    "HICKS": ("HICK",),
    "HUMBOLDT": ("HUMBOLOT", "HUMBARD", "HUMBOLT"),

    "JOHNS": ("JOHOM",),

    "KEAP": ("KIOP", "HEAP"),

    "LAFAYETTE": ("LAY FAY ESTE", "LADORATT", "LAFYAYETTE"),
    "LORIMER": ("LIRA", "LOUMOR", "LARMER"),
    "LAWRENCE": ("LAAVIUK",),
    "LUDLOW": ("LAIDLOW",),
    "LEXINGTON": ("TEX", "LEX"),
    "LEFFERTS": ("REPPERTS",),

    "MYRTLE": ("PARLE", "MALLE", "MYETTE"),
    "MCDOUGAL": ("MC DOUGALL", "MC DOUGAL", "MCDOUGALL"),
    "MCDONOUGH": ("MC DONOUGH",),
    "MARCY": ("MANZA", "MAREY"),
    "MESEROLE": ("MESCOLE",),
    "MOORE": ("MEASE",),
    "MIDDLETON": ("MEDDLER",),
    "MANGIN": ("MANGEN",),
    "MULBERRY": ("HAULL", "MALLERY"),

    "NORFOLK": ("NAPOLK",),
    "NOSTRAND": ("VAST AND",),

    "OAK": ("DAK",),
    "OLIVER": ("OLWEN",),
    "ORCHARD": ("GERHARD",),

    "PITT": ("PUTT",),
    "PIERREPONT": ("PERROTT", "PERROTT PREMPONT"),
    "PLACE": ("PLAD",),
    "PROSPECT": ("PRUFER",),
    "PRESIDENT": ("PREDIDUNT",),
    "PULASKI": ("PALOKA",),

    "RUTLEDGE": ("RUTHIE",),
    "RIDGE": ("RIDAL",),
    "RYERSON": ("RAYSON",),
    "RIVINGTON": ("REVENTON",),
    "RIVERSIDE": ("RUALMAINE", "RICER SIDE"),
    "RENWICK": ("REDERICK", "RENNICK"),

    "SULLIVAN": ("SELLTOWN",),
    "SIDE": ("SISH",),
    "STEUBEN": ("STUCKER",),
    "STATE": ("STATES",),
    "SCHOLES": ("SCHAALS",),
    "SUMMIT": ("SUMME",),
    "SCHERMERHORN": ("SCHOMERDOSA",),
    "SOUTH": ("DOUTH", "SONSE"),
    "STUYVESANT": ("STUYVESTANT",),

    "THOMPSON": ("STONPSON",),
    "TROY": ("TRAY",),
    "TAYLOR": ("TAYLER",),

    "UNION": ("WMON",),

    "VAN BUREN": ("WAR CAREN",),
    "VERNON": ("VEMON",),
    "VANDERBILT": ("VANDERLY", "VANDERSLIDE"),

    "WYONA": ("WYONIA",),
    "WATKINS": ("WITKINS",),
    "WALWORTH": ("WALLWORTH",),
    "WHIPPLE": ("WHIPPER",),
    "WALLABOUT": ("WALLABANK", "WALKABOUT"),
    "WASHINGTON": ("WASH", "WASTEWATER"),
}

_STREET_NAME_LOOKUP = {
    variant: canonical
    for canonical, variants in _STREET_NAME_VARIANTS.items()
    for variant in variants
}

# One alternation over every variant, longest first so that a phrase such as
# "PERROTT PREMPONT" wins over its prefix "PERROTT". The look-arounds require
# the variant to be a complete whitespace-delimited token (or phrase) without
# consuming the surrounding whitespace.
_STREET_NAME_VARIANT = re.compile(
    r"(?<!\S)(?:%s)(?!\S)"
    % "|".join(
        re.escape(variant)
        for variant in sorted(_STREET_NAME_LOOKUP, key=len, reverse=True)
    )
)


def street_name(x):
    """Replace known misspelled street names with their canonical spelling.

    Each variant in ``_STREET_NAME_VARIANTS`` is replaced by its canonical
    name when it appears as a complete token, e.g.
    ``"E ALLEM ST"`` -> ``"E ALLEN ST"`` and ``"LEX AVE"`` -> ``"LEXINGTON AVE"``.

    Differences from the earlier rule-by-rule version:

    * the whitespace after a matched name is no longer swallowed (the old
      patterns ended in ``\\s`` but the replacement had no trailing space, so
      ``"E ALLEM ST"`` became ``"E ALLENST"``);
    * the rules that mapped AMSTERDAM, MADISON and CENTRAL PARK onto
      themselves are dropped -- their only effect was that same loss of the
      following space;
    * TEX, LEX and WALLABANK must now be whole tokens, so names that merely
      contain them (``"TEXAS"``, ``"ALEX"``) are left alone;
    * a variant at the start or end of the address is matched too.

    Parameters
    ----------
    x : str or missing value
        Address text. Patterns are upper-case only. Non-string values
        (NaN, None) are returned unchanged.

    Returns
    -------
    The address with street names corrected, or ``x`` itself when it is not
    a string.
    """
    if not isinstance(x, str):
        return x

    return _STREET_NAME_VARIANT.sub(lambda m: _STREET_NAME_LOOKUP[m.group(0)], x)

# New column: street_name applied to every number-normalised address.
df['street_name'] = df['street_number_name'].apply(street_name)


In [13]:
# Display the 'street_name' column.
df['street_name']

0               38 ST
1            E 107 ST
2               81 ST
3            E 117 ST
4             W 62 ST
            ...      
1995         W 158 ST
1996          E 78 ST
1997            8 AVE
1998    LEXINGTON AVE
1999          E 14 ST
Name: street_name, Length: 2000, dtype: object

Regular expressions in R:  
- https://rstudio-pubs-static.s3.amazonaws.com/74603_76cd14d5983f47408fdf0b323550b846.html
- https://cran.r-project.org/web/packages/stringr/vignettes/regular-expressions.html
 
Regular expressions in Python:  
- https://www.bogotobogo.com/python/python_regularExpressions.php#:~:text=Well%2C%20%5CD%20matches%20any%20character,try%20to%20match%20different%20separators.
 
A useful online tool for testing regular expressions (regex101):  
- https://regex101.com/r/rzNgTL/1

In [14]:
# df.to_csv('census_1910_mn_small_clean_Python_final_v03.csv', index = False)