# 1898 Restaurant Data
Goal: Create a set of uniform code for cleaning up the street names and street numbers. 


In [1]:
import pandas as pd

## Master

In [2]:
DF = pd.read_csv('datasets/1880_streets_full.csv')
DF.head(1)

Unnamed: 0,street_name,odd_on,avg_direction,building_num_range,start_end_coordinates,segment_length,segment_direction,building_num_range_length,avg_length_per_building,direction_deviation,road_type,offset_from_road_center
0,10th Avenue,left,-170.561731,"[1, 19]","[[40.741073, -74.009227], [40.739952, -74.0094...",126.194091,-170.561731,19,6.641794,0.0,Avenue,10


In [3]:
DF['street_name'].value_counts()

10th Avenue         164
8th Avenue          148
5th Avenue          129
2nd Avenue          129
3rd Avenue          126
                   ... 
Cornelia Street       1
Jackson Place         1
Hamilton Street       1
7th Street Place      1
Winthrop Place        1
Name: street_name, Length: 666, dtype: int64

In [4]:
master_list = list(dict(DF['street_name'].value_counts()).keys())
#sorted(master_list)

## 1898

In [5]:
df = pd.read_csv('datasets/1898TrowBus.csv')
df.head(1)

Unnamed: 0,Year,PG,ID,ID2,CAT_ORG,CAT_MOD,FULL,Position,Nation,Name,...,Arrival / I Record,CORNER,Address2,ST combi,ST Name,ST Name 2,ST Name 3,ST Name 4,ST Name 5,ST Name 6
0,1898_TrowBusMan,951,75,1898_TrowBusMan_75,Restaurants (Marked thus + are oyster saloons.),Restaurant,Abbott Charles W. 548 W 39th,18,,Abbott Charles W,...,,N,548,W 39th,W,39th,,,,


In [6]:
df.shape

(1994, 37)

In [7]:
df.columns

Index(['Year', 'PG', 'ID', 'ID2', 'CAT_ORG', 'CAT_MOD', 'FULL', 'Position',
       'Nation', 'Name', 'Addy_Clean', 'Address',
       'Nationality - Katherine Vote', 'Nationality - Zehra Vote',
       'Nationality - Jessica', 'Nationality Final', 'api_result1',
       'api_result2', 'nationality_api', 'Gender - Katherine Vote',
       'Gender - Zehra Vote', 'Gender - Jessica', 'Gender - Final',
       'Gender_api', 'gender_combined', 'gender_mary', 'Notes',
       'Arrival / I Record', 'CORNER', 'Address2', 'ST combi', 'ST Name',
       'ST Name 2', 'ST Name 3', 'ST Name 4', 'ST Name 5', 'ST Name 6'],
      dtype='object')

Get the columns I need.

In [8]:
df = df[['FULL', 'Name', 'Addy_Clean', 'ST combi']]
df.head()

Unnamed: 0,FULL,Name,Addy_Clean,ST combi
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th
1,"Abeles Aaron, 113 Eldridge",Abeles Aaron,113 Eldridge,Eldridge
2,"Abelmont Eugenia, 486, 6th av",Abelmont Eugenia,486 6th av,6th av
3,"Abrahams Anna, 292 Cherry",Abrahams Anna,292 Cherry,Cherry
4,"Abramowitz Louis, 155 Allen",Abramowitz Louis,155 Allen st,Allen st


In [9]:
df.columns = ['full', 'name', 'address', 'street']
df.head(1)

Unnamed: 0,full,name,address,street
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th


In [10]:
#df['address'].str.split().str[0]

In [11]:
df['no'] = df['address'].str.split().str[0]
df.head()

Unnamed: 0,full,name,address,street,no
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th,548
1,"Abeles Aaron, 113 Eldridge",Abeles Aaron,113 Eldridge,Eldridge,113
2,"Abelmont Eugenia, 486, 6th av",Abelmont Eugenia,486 6th av,6th av,486
3,"Abrahams Anna, 292 Cherry",Abrahams Anna,292 Cherry,Cherry,292
4,"Abramowitz Louis, 155 Allen",Abramowitz Louis,155 Allen st,Allen st,155


### Street

### systematic cleaning

#### strip unnecessary white spaces

In [12]:
df['street_clean'] = df['street'].str.strip()
df.head(1)

Unnamed: 0,full,name,address,street,no,street_clean
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th,548,W 39th


Inspect `street_clean` data.

In [13]:
st = dict(df['street_clean'].value_counts())
#sorted(st.items())

#### add "street" after "E xxth" or "W xxth"

In [14]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x+' Street' if x.startswith('E ') | x.startswith('W ') else x)
df.head(1)

Unnamed: 0,full,name,address,street,no,street_clean
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th,548,W 39th Street


In [15]:
st = dict(df['street_clean'].value_counts())
#sorted(st.items())

#### replace abbreviation with full names.

In [16]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' st', ' Street') if x.endswith(' st') else x)

In [17]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' av', ' Avenue') if x.endswith(' av') else x)

In [18]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' pl', ' Place') if x.endswith(' pl') else x)

In [19]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' mkt', ' Market') if x.endswith(' mkt') else x)

In [20]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' ln', ' Lane') if x.endswith(' ln') else x)

In [21]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' sq', ' Square') if x.endswith(' sq') else x)

In [22]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x.replace(' slip', ' Slip') if x.endswith(' slip') else x)

In [23]:
st = dict(df['street_clean'].value_counts())
#sorted(st.items())

#### add "street" to all the ones with only one word

In [24]:
df['street_clean'] = df['street_clean'].apply(
    lambda x: x+' Street' if len(x.split())==1 else x)

In [25]:
st = dict(df['street_clean'].value_counts())
#sorted(st.items())

#### change to title case

### cross check results

In [26]:
streets_list = list(dict(df['street_clean'].value_counts()).keys())
streets_list[:5]

['3rd Avenue', 'Broadway Street', '2nd Avenue', '1st Avenue', '6th Avenue']

In [27]:
for street in streets_list:
    if street not in master_list:
        print(street)

Broadway Street
E Houston Street
W Broadway Street

Park row
Amsterdam Avenue
Columbus Avenue
Av A
W Houston Street
E Broadway Street
Fulton Market
Av C
Av B
Chatham Square
5th Street
St Mark’s Place
Clinton Market
N Moore
2nd Street
1st Street
Lincoln Avenue
Tompkins Market
8th Street
Trinity Place
Macdougal Street
av c W
S William
Washington Market
Union sq E
Tremont Avenue
6th Street
Willis Avenue
7th Street
3 Avenue
E 145th Street
8d Avenue
av n E
c 3rd Avenue
Av D
Stone William
Jefferson Market
Jerome Avenue
W End av Street
New Chambers
Little W 12th
N William
av n W
Bible h
Centre Market
9th Street
Brook Avenue
Grand st Circle
VandeWater Street
Hanover Square
Catherine Market


### manual cleaning

In [28]:
STREET = {
    # street added wrongly
    '1st Street': '1st Avenue',
    '2nd Street': '2nd Avenue',
    '3 Avenue': '3rd Avenue',
    'c 3rd Avenue': '3rd Avenue',
    '5th Street': '5th Avenue',
    '6th Street': '6th Avenue',
    '7th Street': '7th Avenue',
    '8th Street': '8th Avenue',
    '8d Avenue': '8th Avenue',
    '9th Street': '9th Avenue',
    'Broadway Street': 'Broadway',
    
    # direction order
    'W Broadway Street': 'Broadway W',
    'E Broadway Street': 'Broadway E',
    'W Houston Street': 'Houston Street W',
    'E Houston Street': 'Houston Street E',
    'N Moore': 'Moore Street N',
    'S William': 'William Street S',
    'N William': 'William Street N',
    'Union sq E': 'Union Square E',
    'Little W 12th': 'W 12th Street',
    'E 145th Street': 'E 145th Street',
    
    # title case
    'Park row': 'Park Row',
    'James slip': 'James Slip',
    'Old slip': 'Old Slip',
    'Catherine slip': 'Catherine Slip',
    'Peck slip'
    
    # order
    'Av A': 'A Avenue',
    'Av B': 'B Avenue',
    'Av C': 'C Avenue',
    'av c W': 'C Avenue',
    'Av D': 'D Avenue',
    'av n E': 'East Avenue',
    'av n W': 'West Avenue',
    'W End av Street': 'West Avenue',
    
    # same name
    'Centre Market': 'Centre Market Place',
    'Clinton Market': 'Clinton Place',
    'Chatham Square': 'Chatham Street',
    'Catherine Market': 'Catherine Street',
    'Fulton Market': 'Fulton Street',
    'Grand st Circle': 'Grand Street',
    'Jefferson Market': 'Jefferson Street',
    'Stone William': 'Stone Street',
    'Tompkins Market': 'Tompkins Street',
    'Washington Market': 'Washington Place',
    'Hanover Square': 'Hanover Street',
    'New Chambers': 'New Chambers Street',
    
    # different spelling
    'Macdougal Street': 'Mac Dougal Street',
    'St Mark’s Place': 'Saint Marks Place',
    'VandeWater Street': 'Vandewater Street',
    
    # unmatched
    # Amsterdam Avenue
    # Columbus Avenue
    # Trinity Place
    # Lincoln Avenue
    # Tremont Avenue
    # Brook Avenue
    # Willis Avenue
    # Bible h
    # Jerome Avenue
}

In [29]:
df = df.replace({'street_clean': STREET})
df.head(1)

Unnamed: 0,full,name,address,street,no,street_clean
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th,548,W 39th Street


In [30]:
streets_list = list(dict(df['street_clean'].value_counts()).keys())
streets_list[:5]

['3rd Avenue', '2nd Avenue', '1st Avenue', 'Broadway', '6th Avenue']

### cross check results

In [31]:
for street in streets_list:
    if street not in master_list:
        print(street)

Broadway W

Columbus Avenue
Amsterdam Avenue
Av A
Broadway E
Lincoln Avenue
Trinity Place
Tremont Avenue
West Avenue
Willis Avenue
Jerome Avenue
Bible h
E 145th Street
East Avenue
Brook Avenue


### House Number

In [32]:
df['num_clean'] = df['no']
df.head(1)

Unnamed: 0,full,name,address,street,no,street_clean,num_clean
0,Abbott Charles W. 548 W 39th,Abbott Charles W,548 W 39th,W 39th,548,W 39th Street,548


In [33]:
no = dict(df['num_clean'].value_counts())
#sorted(no.items())

In [34]:
df['num_clean'] = df['num_clean'].apply(
    lambda x: x.replace('th', '') if str(x).endswith('th') else x)

In [35]:
df['num_clean'] = df['num_clean'].apply(
    lambda x: x.replace('d', '') if str(x).endswith('d') else x)

In [36]:
df['num_clean'] = df['num_clean'].dropna().astype(str).astype(float).astype(int)

In [37]:
df.to_csv('cleaned/1898_address_CLEANED.csv')