In [1]:
#import the packages that needed
import pandas as pd
import numpy as np
import regex as re
from special_cases import special_character
from street_direction import street_direction
from street_name import street_name
from street_number_name import street_number_name
from hn_street_split import split
from street_type import street_type

import warnings
warnings.filterwarnings('ignore')

In [2]:
def clean(df, column):  
        
#         Clean the street name and return the final clean addresses
    df[column] = df[column].apply(lambda x: str(x).upper())
    df['street_direction_clean'] = df[column].apply(lambda x: street_direction(str(x)))
    df['street_type_clean'] = df['street_direction_clean'].apply(lambda x: street_type(x))
    
    df['street_without_hn'] = df['street_type_clean'].apply(lambda x: split(str(x)))

    df['street_number_name_clean'] = df['street_without_hn'].apply(lambda x: street_number_name(x))
    df['street_name_clean'] = df['street_number_name_clean'].apply(lambda x: street_name(x))
    df['final_clean_address'] = df['street_name_clean'].apply(lambda x: special_character(x))
    
    df['final_clean_address'] = np.where(df['final_clean_address'] == 'ST', df['street_type_clean'], df['final_clean_address'])
    
#         Extract different components of the addresses, e.g direction: W/E/S/N; street type: AVE/PL/ST/CIR, etc.
    pattern_street_direction = r'(?<=\s)[N|W|S|E]\s|^[N|W|S|E]\s|(?<=\s)[N|W|S|E]\d+'
    df['street_direction'] = df['final_clean_address'].apply(lambda x: ' '.join(re.findall(pattern_street_direction, x)))

    pattern_street_type = r'(?<=\s)ST$|(?<=\s)DR$|(?<=\s)CIR$|(?<=\s)AVE$|(?<=\s)CT$|(?<=\s)BLVD$|(?<=\s)ALY$|(?<=\s)PLZ$|(?<=\s)PARK$|(?<=\s)PKWY$|(?<=\s)APPROACH$|(?<=\s)TER$|(?<=\s)PL$|(?<=\s)LN$|(?<=\s)BRG$|(?<=\s)HL$|(?<=\s)HTS$|(?<=\s)SLIP$|(?<=\s)ROW$|(?<=\s)SQ$'
    df['street_type'] = df['final_clean_address'].apply(lambda x: ' '.join(re.findall(pattern_street_type, x)))

    df['street_name1'] = df.apply(lambda x: x["final_clean_address"].replace(x["street_type"], "").strip(), axis=1)
    df['street_name'] = df.apply(lambda x: x["street_name1"].replace(x["street_direction"], "").strip(), axis=1)
    df.drop('street_name1', axis=1, inplace = True)

    return df[[column, 'street_direction', 'street_name', 'street_type', 'final_clean_address']]


In [3]:
census_1910_h_mn_10k = pd.read_csv('census_1910_h_mn_10k.csv')
census_1910_h_mn_10k.columns

Index(['Record type', 'Standardized township (string)', 'County 2',
       'Enumeration district 2',
       'Consistent historical data person identifier',
       'Dwelling sequence number', 'Dwelling serial number',
       'Dwelling serial number 2', 'Household sequence within dwelling',
       'Household sequence within dwelling, 8 digit',
       'Household serial number 2',
       'Household serial number, before large group quarters were split up (100% datasets)',
       'Individual sequence number',
       'Large group quarters that was split up (100% datasets)', 'Line number',
       'Line number 2', 'Microfilm page number',
       'Number of families in household',
       'Number of person records in household, before large group quarters were split up  (100% datasets)',
       'House number', 'Street address 2'],
      dtype='object')

In [4]:
census_1880_h_mn_10k = pd.read_csv('census_1880_h_mn_10k.csv')
census_1880_h_mn_10k.columns

Index(['rectype', 'year', 'serial', 'dwsize', 'city', 'pageno', 'microseq',
       'nfams', 'splithid', 'splitnum', 'mcd', 'county', 'enumdist', 'supdist',
       'street'],
      dtype='object')

In [5]:
new_df_mn_1910 = clean(census_1910_h_mn_10k, 'Street address 2')

In [6]:
new_df_mn_1910

Unnamed: 0,Street address 2,street_direction,street_name,street_type,final_clean_address
0,38 STREET,,38,ST,38 ST
1,EAST 107TH STREET,E,107,ST,E 107 ST
2,81ST STREET,,81,ST,81 ST
3,EAST 117TH STREET,E,117,ST,E 117 ST
4,WEST 62ND STREET,W,62,ST,W 62 ST
...,...,...,...,...,...
9995,BROADWAY BET 85886 ST BRETTON HA,,BROADWAY BET 85886 ST BRETTON HA,,BROADWAY BET 85886 ST BRETTON HA
9996,49TH STREET,,49,ST,49 ST
9997,WEST EIGHTEENTH STREET,W,18,ST,W 18 ST
9998,EAST 13TH ST,E,13,ST,E 13 ST


In [7]:
new_df_mn_1880 = clean(census_1880_h_mn_10k, 'street')

In [8]:
new_df_mn_1880

Unnamed: 0,street,street_direction,street_name,street_type,final_clean_address
0,310 E HOUSTON ST,E,HOUON,ST,E HOUSTON ST
1,1168 2ND AV,,2,AVE,2 AVE
2,313 60TH ST,,60,ST,60 ST
3,632 HUDSON ST,,HUDSON,ST,HUDSON ST
4,56 CENTRE ST,,CENTRE,ST,CENTRE ST
...,...,...,...,...,...
9995,400R 8TH ST,,8,ST,8 ST
9996,88 ANN ST,,ANN,ST,ANN ST
9997,171 4TH ST,,4,ST,4 ST
9998,415 E 10TH ST,E,10,ST,E 10 ST


In [9]:
new_df_mn_1910.to_csv('new_df_mn_1910_10k.csv', index = False)

In [10]:
new_df_mn_1880.to_csv('new_df_mn_1880_10k.csv', index = False)