In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import numpy as np
import re 
import matplotlib.pyplot as plt
import seaborn as sns

from geographiclib.geodesic import Geodesic
geod = Geodesic.WGS84

from scipy.interpolate import interp1d

from glob import glob

import folium

# !pip3 install geopy
from geopy.distance import distance

# !pip3 install iteround
from iteround import saferound

from collections import Counter

def flatten_list(l):
    return [item for sublist in l for item in sublist]

def angleDiff(sourceA,targetA):
    a = targetA - sourceA
    a = (a + 180) % 360 - 180
    return a

def unique_num(x):
    return len(set(x))
    
def add_degree_to_azimuth(current, change):
    assert(abs(change)<180)
    output = current+change
    if output<-180:
        output = 180-(-output-180)
    elif output>180:
        output = -180+(output-180)
    if output == -180:
        output = -output
    return output

def replace_word(x, dic):
    for key,value in dic.items():
        x = x.replace(key,value)
    return x

# !pip3 install fuzzywuzzy
from fuzzywuzzy import fuzz

## Start geocoding

In [2]:
full = pd.read_csv('1880_streets_full.csv')
full['building_num_range'] = full['building_num_range'].apply(lambda x: np.array(eval(x)))
full['start_end_coordinates'] = full['start_end_coordinates'].apply(lambda x: np.array(eval(x)))


streets = full.street_name.drop_duplicates().tolist()

geo = pd.read_csv('Cleaned_manhattan.csv')

In [4]:
full.head()

Unnamed: 0,street_name,odd_on,avg_direction,building_num_range,start_end_coordinates,segment_length,segment_direction,building_num_range_length,avg_length_per_building,direction_deviation,road_type,offset_from_road_center
0,10th avenue,left,9.438109,"[1, 19]","[[40.739952, -74.009472], [40.741073, -74.0092...",126.194091,9.438109,19,6.641794,0.0,Avenue,10
1,10th avenue,left,29.586666,"[20, 43]","[[40.741073, -74.009227], [40.741552, -74.0089...",95.813888,31.457163,24,3.992245,-1.870497,Avenue,10
2,10th avenue,left,29.586666,"[44, 57]","[[40.741552, -74.008912], [40.742288, -74.00832]]",59.475289,26.573321,14,4.248235,3.013345,Avenue,10
3,10th avenue,left,27.73663,"[58, 77]","[[40.742288, -74.00832], [40.742928, -74.007856]]",81.474865,26.604547,20,4.073743,1.132084,Avenue,10
4,10th avenue,left,27.73663,"[78, 95]","[[40.742928, -74.007856], [40.743584, -74.0074...",81.16036,28.873101,18,4.508909,-1.136471,Avenue,10


In [3]:
geo.head()

Unnamed: 0,DataID,RecordId,Name,First Name,Middle Name,Surname,HouseHoldId,Street Name,House Number,Dwelling Number,Own or Rent,Relationship,Gender,Race,Age,Marital Status,Number of Years of Present Marriage,Number of Children Born,Number of Children Living,Birthplace,Father's Birthplace,Mother's Birthplace,US Born,Immigration Year,Naturalization Status,Self Native Tongue,Occupation,Industry,Self Employed or Not,Out of Work on April 15,Number of Weeks Out of Work,Can Read,Can Write,Attended School,Blind,Deaf and Dumb,Enumeration District Number,Ward of City
0,4449800_00006,18491317,August Mohr,August,,Mohr,18491317,Whitehall St,28-3,1.0,Rent,Head,Male,White,24.0,Married,2,,,New York,Germany,New York,True,Inapplicable,Inapplicable,English,Laborer,Produce,Wage Earner,False,6.0,Yes,Yes,No,False,False,1,1
1,4449800_00162,123472341,Fredrick Val Diem,Fredrick,Val,Diem,18494569,Washington Street,3,2.0,,Son,Male,White,1.0,Single,Inapplicable,,,New York,Germany,Hungary,True,Inapplicable,Inapplicable,Unknown,English,,,,,,,,False,False,6,1
2,4449800_00162,18494554,William Hassen,William,,Hassen,18494554,Washington Street,3,2.0,Rent,Head,Male,White,21.0,Married,6,,,Turkey,Turkey,Turkey,False,1904.0,Papers,English,Proprietor,Dry Goods,Own Account,,,Yes,Yes,,False,False,6,1
3,4449800_00162,123472343,Naffie Hassen,Naffie,,Hassen,18494554,Washington Street,3,2.0,,Wife,Female,White,25.0,Married,6,3.0,2.0,Turkey,Turkey,Turkey,False,1904.0,,English,,,,,,Yes,Yes,,False,False,6,1
4,4449800_00162,123472344,Samuel Hassen,Samuel,,Hassen,18494554,Washington Street,3,2.0,,Son,Male,White,4.0,Single,Inapplicable,,,New York,Turkey,Turkey,True,Inapplicable,Inapplicable,English,,,,,,,,,False,False,6,1


In [7]:
# geo['Street Name'].value_counts()[:100]

In [8]:
ordinal_correction_mapping = pd.DataFrame([e.split() for e in '''1st	first
2nd	second
3rd	third
4th	fourth
5th	fifth
6th	sixth
7th	seventh
8th	eighth
9th	ninth
10th	tenth
11th	eleventh
12th	twelfth
13th	thirteenth
14th	fourteenth
15th	fifteenth
16th	sixteenth
17th	seventeenth
18th	eighteenth
19th	nineteenth
20th	twentieth'''.split('\n')]).set_index(1).to_dict()[0]

In [10]:
address_correction_mapping = {' ship':' slip'}
 
address_correction_mapping.update(ordinal_correction_mapping)

direction_mapping = {'west ':'w ','east ':'e ','north ':'n ','south ':'s '}

In [11]:
geo['Street Name New'] = geo['Street Name'].apply(lambda x: replace_word(x.lower(),address_correction_mapping))

geo['Street Name New'] = geo['Street Name New'].apply(lambda x: x.replace(' st',' street') if x.endswith(' st') else x)

geo['Street Name New'] = geo['Street Name New'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]+','',x))

In [13]:
raw_streets = geo['Street Name New'].value_counts().index.tolist()

In [35]:
def get_potential_match(this_street_name):

    if this_street_name in streets:
        return this_street_name
    
    if this_street_name+' street' in streets:
        return this_street_name+' street'
    
    prefix = ''
    number_street = False
    if this_street_name.startswith('east ') or this_street_name.startswith('west ') or this_street_name.startswith('e ') or this_street_name.startswith('w '):
        prefix = this_street_name.split()[0][0] + ' '
        x = this_street_name.split(' ',1)[1]
    else:
        x = this_street_name
    if len(re.findall(r'^\d+(?:th|st|nd|rd|) (?:street|st|avenue|ave)$',x))>0:
        number_street = True
        num, road_type = x.split()
        if num.isnumeric():
            if num[-2:] in ['11','12','13']:
                num = num+'th'
            else:
                if num[-1]=='1':
                    num = num+'st'
                elif num[-1]=='2':
                    num = num+'nd'
                elif num[-1]=='3':
                    num = num+'rd'
                else: 
                    num = num+'th'
            this_street_name = prefix + num + ' ' + ('avenue' if road_type == 'ave' else 'street' if road_type == 'st' else road_type)

    
    if this_street_name.startswith('ave ') or this_street_name.startswith('avenue '):
        this_street_name = this_street_name.replace('ave ','').replace('avenue ','')+' avenue'
        
    if this_street_name.endswith(' ave'):
        this_street_name = this_street_name.replace(' ave',' avenue')
    
    if not re.sub(r'\s+','',this_street_name).isalpha():
        this_street_name = replace_word(this_street_name,direction_mapping)
    
    if this_street_name in streets:
        return this_street_name
    else:
        if number_street:
            if 'e ' + this_street_name in streets or 'w ' + this_street_name in streets:
                return this_street_name
    
    
    streets_df = pd.DataFrame(zip(streets),columns = ['street_name'])

    streets_df['match_score'] = -streets_df['street_name'].apply(lambda x: fuzz.partial_ratio(x,this_street_name))

    streets_df = streets_df.sort_values('match_score')

    streets_df['ranking'] = streets_df['match_score'].rank(method='dense')

    potentials = streets_df[streets_df['ranking']==1].street_name.tolist()    
    
    return '?' + ', '.join(potentials)

In [36]:
potential_matched_streets = [get_potential_match(s) for s in raw_streets]

In [37]:
raw_and_potential = pd.DataFrame(zip(raw_streets,potential_matched_streets), columns=['raw','potential'])

In [38]:
raw_and_potential['counts'] = geo['Street Name New'].value_counts().values

In [39]:
matched = raw_and_potential[~raw_and_potential.potential.str.startswith('?')].copy()

auto_matched_mapping = matched.drop('counts',axis=1).set_index('raw').to_dict()['potential']

matched.counts.sum()/raw_and_potential.counts.sum()

0.792794456476298

In [42]:
need_fix = raw_and_potential[(raw_and_potential.potential.str.startswith('?'))&(raw_and_potential.counts>100)].copy()

need_fix.to_csv('need_fix.csv',index=False)

need_fix.counts.sum()/raw_and_potential.counts.sum()

0.1546566416355218

In [44]:
fixed = pd.read_csv('fixed.csv')

manual_fixed_mapping = fixed[fixed.potential!='?'].drop('counts',axis=1).set_index('raw').to_dict()['potential']

auto_matched_mapping.update(manual_fixed_mapping)

In [34]:
# import pickle
# pickle.dump(auto_matched_mapping,open('_auto_matched_mapping.txt','wb'))

In [46]:
geo['Street Name New'] = geo['Street Name New'].apply(lambda x: auto_matched_mapping[x] if x in auto_matched_mapping.keys() else '?'+x)

In [55]:
def get_house_number(x):
    
    x = x.replace(' to ',' - ').replace('.0','')
    x = ''.join([letter for letter in list(x) if not letter.isalpha()]).strip()
    x = re.sub(r'(\d+) \d/\d',r'\1',x)
    x = re.sub(r'\s*(\&|\+|\-|\/)\s*','-',x)
    x = re.sub(r'(\d+)\s+(\d+)',r'\1-\2',x)
    x = x.replace('#','').replace('%','').replace('(','').replace(')','')
    x = re.sub(r'\-$','',x)
    x = x.strip()

    if x=='':
        return np.nan
    
    try:
        num = int(x)
        return num
    except:
        
        try:
        
            num_li = x.split('-')
            start,end = num_li[0],num_li[-1]

            if len(end)>len(start):
                num = np.nan
            elif len(end) == len(start):
                if int(end)<=int(start):
                    num = np.nan
                else:
                    num = (int(end)+int(start))//2
            elif len(end)<len(start):
                start_latter_digits = start[-len(end):]
                if int(end)<=int(start_latter_digits):
                    num = np.nan
                else:        
                    middle = (int(start_latter_digits)+int(end))//2
                    num = int(start[:-len(end)])*(10**len(start_latter_digits)) + middle
        
            return num
        
        except:
            
            return x

In [56]:
geo['House Number New'] = geo['House Number'].apply(get_house_number)

In [57]:
assert(geo['House Number New'].apply(lambda x: isinstance(x,str)).sum()==0)

In [58]:
geo['House Number New'].isnull().sum()

16186

In [276]:
street_name_to_max_building_num = full.groupby('street_name')['building_num_range'].apply(list).apply(lambda li: max([t[-1] for t in li])).to_dict()
def ensure_building_num_valid(street_name, building_num):
    if np.isnan(building_num):
        return building_num
    upper_bound = street_name_to_max_building_num[street_name] if street_name in street_name_to_max_building_num.keys() else 999
    if building_num>upper_bound:
        building_num_string = str(int(building_num))
        
        half_length = len(building_num_string)//2
        start = int(building_num_string[:half_length])
        end = int(building_num_string[half_length:])
        middle = (start+end)//2
        
        if start<end and middle<=upper_bound:
            return middle
        else:
            return np.nan
    else:
        return building_num
    

In [288]:

geo.loc[geo['House Number New'].notnull(),'House Number New'] = geo.loc[geo['House Number New'].notnull(),['House Number New','Street Name New']].apply(lambda row: ensure_building_num_valid(row['Street Name New'], row['House Number New']), axis=1)


In [289]:
geo['House Number New'].isnull().sum()

25958

In [304]:
geo['Street Name New'] = geo['Street Name New'].apply(lambda x: np.nan if x.startswith('?') else x)

In [307]:
geo['Address'] = np.nan

In [310]:
geo.loc[(geo['Street Name New'].notnull()) & (geo['House Number New'].notnull()), 'Address'] = geo.loc[(geo['Street Name New'].notnull()) & (geo['House Number New'].notnull()),['Street Name New','House Number New']].apply(lambda row: row['Street Name New'] + ' ' + str(int(row['House Number New'])), axis=1)

In [315]:
geo.to_csv('_Manhattan_with_address.csv',index=False)

### E & W of a street should be searchable with street name

In [331]:
temp = full[full.street_name.str.endswith('houston street')].copy().reset_index(drop=True)
temp[23:27]

Unnamed: 0,street_name,odd_on,avg_direction,building_num_range,start_end_coordinates,segment_length,segment_direction,building_num_range_length,avg_length_per_building,direction_deviation,road_type,offset_from_road_center
23,e houston street,right,107.04583,"[465, 472]","[[40.719946, -73.977607], [40.719796, -73.9769...",56.823608,107.04583,8,7.102951,0.0,Street,5
24,e houston street,right,108.392973,"[473, 512]","[[40.719796, -73.976964], [40.719522, -73.9758...",96.429708,108.392973,40,2.410743,0.0,Street,5
25,w houston street,left,-56.415274,"[1, 22]","[[40.725984, -73.99664], [40.726384, -73.997456]]",78.550051,-53.976604,22,3.570457,-2.43867,Street,5
26,w houston street,left,-56.415274,"[23, 43]","[[40.726384, -73.997456], [40.726768, -73.9981...",80.139322,-59.34019,21,3.816158,2.924916,Street,5


In [333]:
### Visual inspect

street_points_list = temp.start_end_coordinates.tolist()

all_streets_center_loc = np.mean([np.mean(li, axis=0).tolist() for li in street_points_list],axis=0)

my_map = folium.Map(location=all_streets_center_loc,
                        zoom_start= 14 ,
                        tiles="CartoDB dark_matter")

for street_points in street_points_list[:25]:
    folium.PolyLine(street_points, color="red", weight=1).add_to(my_map)
for street_points in street_points_list[25:]:
    folium.PolyLine(street_points, color="blue", weight=1).add_to(my_map)

my_map

In [336]:
geo = pd.read_csv('Manhattan_with_address.csv')

In [337]:
geo = geo[['RecordId','Street Name','House Number','Street Name New','House Number New','Address','Enumeration District Number','Ward of City']]

geo = geo.drop_duplicates(subset=geo.columns[1:].tolist(), keep='last')[geo.columns[1:]].reset_index()

geo['Number of Residents'] = geo['index'].diff().fillna(geo['index'][0]).apply(int)

geo = geo.drop('index',axis=1)

In [338]:
# print(', '.join(sorted(geo['Street Name New'].drop_duplicates().tolist())))
print()
print(f"{np.mean(geo['Street Name New'].isin(streets))*100:.1f}% of the streets in geo are covered in our 1880 address database.")
print()


78.3% of the streets in geo are covered in our 1880 address database.



In [339]:
addresses = sorted(geo.dropna().Address.drop_duplicates().tolist())

In [340]:
def get_coordinates_from_details(target_building_num,building_num_range,start_end_coordinates,segment_direction,odd_on,offset_from_road_center):
    
    is_odd = target_building_num%2==1
    
    if (building_num_range[1] - building_num_range[0])==0:
        street_center_position = np.mean(start_end_coordinates,axis=0).tolist()
    else:
        f_pt_proportion = (target_building_num - building_num_range[0])/(building_num_range[1] - building_num_range[0])
        t_pt_proportion = 1 - f_pt_proportion
        street_center_position = np.average(np.array(start_end_coordinates), weights = (f_pt_proportion,t_pt_proportion), axis=0).tolist()

    offset_direction = add_degree_to_azimuth(segment_direction,-90) if ((odd_on=='left' and is_odd) or (odd_on=='right' and not is_odd))  else add_degree_to_azimuth(segment_direction,90)

    target_position = geod.Direct(*street_center_position, offset_direction, offset_from_road_center)
    target_point = list([target_position['lat2'],target_position['lon2']])

    return target_point

In [341]:
def get_addr_coordinates(addr):
    target_street_name, target_building_num = ' '.join(addr.split()[:-1]), addr.split()[-1]
    target_building_num = int(target_building_num)
    matched = full[full.street_name==target_street_name].copy().apply(lambda row: get_coordinates_from_details(target_building_num, row['building_num_range'],row['start_end_coordinates'],row['segment_direction'],row['odd_on'],row['offset_from_road_center']) if target_building_num>= row['building_num_range'][0] and target_building_num <= row['building_num_range'][1] else np.nan, axis=1).dropna()
    if len(matched)>0:
        return tuple(matched.tolist()[0])
    return (np.nan,np.nan)

In [342]:
addresses_coordinates = []
for addr in addresses:
    addresses_coordinates.append(get_addr_coordinates(addr))

addr_to_coordinates = pd.DataFrame(zip(addresses,addresses_coordinates),columns=['Address','Coordinates'])

In [393]:
new_geo = pd.merge(geo,addr_to_coordinates)
new_geo['Lat'],new_geo['Lon'] = zip(*new_geo['Coordinates'])

In [396]:
print()
print(f"{np.round(new_geo['Lat'].isnull().sum()/len(new_geo),3)*100:.1f}% of the address are not found.")
print(f"{(new_geo.drop_duplicates(subset=['Address'])['Lat'].value_counts()[new_geo.drop_duplicates(subset=['Address'])['Lat'].value_counts()>1].sum()/len(geo.drop_duplicates(subset=['Address'])))*100:.1f}% of the addresses have the same coordinate with somewhere else.\n")


10.7% of the address are not found.
0.3% of the addresses have the same coordinate with somewhere else.



In [411]:
df = pd.read_csv('Manhattan_with_address.csv')
geo_info_df = new_geo[['Address','Lat','Lon']].dropna().drop_duplicates()
df = pd.merge(df,geo_info_df,how='left')
assert(df.RecordId.nunique() == len(df))
print(f"{df.Lat.notnull().sum()/len(df)*100:.1f}% of the entries are geocoded.")

80.4% of the entries are geocoded.


In [415]:
valid_new_geo = new_geo[new_geo.Coordinates!=(np.nan,np.nan)].copy()

max_num_of_res = valid_new_geo['Number of Residents'].max()

list_of_points = valid_new_geo.Coordinates.tolist()

residents_at_points = valid_new_geo['Number of Residents'].tolist()

max_r = (0.5+np.log(max(residents_at_points))**1.2)

center_loc = np.mean( list_of_points ,axis=0).tolist()

my_map = folium.Map(location=center_loc,zoom_start=15)# ,tiles="CartoDB dark_matter")


plot_streets = valid_new_geo['Street Name New'].drop_duplicates().tolist()
for s in plot_streets:

    line_weight = 4 if s.endswith('Avenue') or s.endswith('Broadway') else 2
    street_points = full[full.street_name==s].start_end_coordinates.tolist()
    folium.PolyLine(street_points, color='dimgrey', weight=line_weight, alpha = 0.1, popup=s).add_to(my_map)
    

for i in range(len(list_of_points)):
    
    pt = list_of_points[i]
    num_of_res = residents_at_points[i]
        
    r = 0.5+np.log(num_of_res)**1.2
    
    color_ratio = r/max_r
    rgb_tuple = (int(255/2 + 255/2*color_ratio),int(0),int(255/2 - 255/2*color_ratio))
    hex_code = '#%02x%02x%02x' % rgb_tuple
    
    folium.Circle(location=pt,radius=r,color=hex_code,alpha=0.3 ).add_to(my_map)

# my_map

In [419]:
# my_map

In [418]:
my_map.save('large_demo.html')

In [420]:
df.to_csv('Geocoded_Manhattan.csv',index=False)

#### Point order problem

In [None]:
# this_street = '10th Avenue'
# full[full.street_name==this_street]

In [None]:
# full[full.street_name==this_street]['start_end_coordinates'].apply(lambda li: li[0][0]).plot()

In [None]:
# full[full.street_name==this_street]['start_end_coordinates'].apply(lambda li: li[0][1]).plot()

In [None]:
# full[full.street_name==this_street].segment_direction.plot()

In [None]:
# street_points_list = full[full.street_name==this_street].start_end_coordinates.tolist() 

# ### Visual inspect

# colors = ['red','orange','green','blue','purple','white','brown']

# my_map = folium.Map(location=np.mean(flatten_list(street_points_list),axis=0).tolist(),
#                         zoom_start= 17 ,
#                         tiles="CartoDB dark_matter")

# for i in range(len(street_points_list)):
#     street_points = street_points_list[i]
#     # print(i,street_points)
#     folium.PolyLine(street_points, color=colors[i%len(colors)], weight=(1+i%len(colors))).add_to(my_map)

# #[[40.710512, -73.992112], [40.710864, -73.99232], [40.711312, -73.992416]]
    
# my_map

In [None]:
# street_points_list = raw[raw['Name']==this_street.split()[0]].sort_values('L_f_add').reset_index(drop=True).coordinates.tolist()


# ### Visual inspect

# center = np.mean(flatten_list(street_points_list),axis=0).tolist()

# colors = ['red','orange','green','blue','purple','white','brown']

# my_map = folium.Map(location=center,
#                         zoom_start= 17 ,
#                         tiles="CartoDB dark_matter")

# for i in range(len(street_points_list)):
#     street_points = street_points_list[i]
#     # print(i,street_points)
#     folium.PolyLine(street_points, color=colors[i%len(colors)], weight=(1+i%len(colors))).add_to(my_map)

# my_map

**END** Point order problem <br><br><br><br><br>

#### Old Geo

In [None]:
## old_geo = pd.read_csv('/Users/timsmac/Desktop/us_census_data/aggregate/Geoloaded_1910_New York_Manhattan Ward 7.csv')

## old_geo = old_geo[['Street Name','House Number','Street Name New', 'House Number New', 'Address', 'Lon', 'Lat', 'Enumeration District Number', 'Ward of City']]

## old_geo = old_geo.drop_duplicates(['Street Name','House Number'])

## old_geo.to_csv('_old_geo_for_comparison.csv',index=False)

**END** Old Geo<br><br><br><br><br>

In [None]:
# li = [street_center_position,target_point]

# my_map = folium.Map(location=np.mean(li,axis=0).tolist(), zoom_start= 20, tiles="CartoDB dark_matter")

# for i in range(len(li)):
#     pt = li[i]
#     folium.Circle(pt, radius=2,color=colors[i%len(colors)]).add_to(my_map)
    
# my_map

In [None]:
# # # Check and verify that there is overlapping building num range problem

# data = df.copy()

# data['building_num_range'] = df[['min_building_num','max_building_num']].apply(lambda row: list(range(row['min_building_num'],row['max_building_num'])),axis=1)

# overlapping_range_detection = data.groupby('street_name').agg({'building_num_range':list})

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(flatten_list)

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda li: Counter(li))

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda counter: [k for k, v in counter.items() if v > 1])

# overlapping_range_detection['building_num_range'].apply(len).value_counts()

# overlapping_range_detection['building_num_range'][overlapping_range_detection['building_num_range'].apply(len)>0][:100]

In [None]:
# # # Fix max_building_num

# non_overlapping_range_df = pd.DataFrame()

# for street_name in df.street_name.unique().tolist():
    
#     data = df[df.street_name==street_name]

#     data['next_segment_min_building_num'] = data.min_building_num.shift(-1)

#     data['max_building_num'] = data.apply(lambda row: row['max_building_num'] if np.isnan(row['next_segment_min_building_num']) else row['max_building_num'] if row['max_building_num']<row['next_segment_min_building_num'] else row['next_segment_min_building_num']-1  , axis=1).apply(int)

#     data = data.drop('next_segment_min_building_num',axis=1)
    
#     non_overlapping_range_df = non_overlapping_range_df.append(data, ignore_index=True)
    

In [None]:
# ### Visual inspect

# street_points_list = df.coordinates.tolist()

# all_streets_center_loc = np.mean([np.mean(li, axis=0).tolist() for li in street_points_list],axis=0)

# my_map = folium.Map(location=all_streets_center_loc,
#                         zoom_start= 13 ,
#                         tiles="CartoDB dark_matter")

# for street_points in street_points_list:

#     folium.PolyLine(street_points, color="red", weight=1).add_to(my_map)

# my_map

In [None]:
# glob('1910MAN/*')

# df = pd.read_csv('1910MAN/1910MANstCLN_W7.csv')

# geo_relevant_cols = ['DataID', 'RecordId', 'HouseHoldId', 'Street Name', 'St_MOD',
#        'ADDY_CLN', 'ZIP', 'City', 'State', 'Country', 'House Number', 'HoNo_1',
#        'Dwelling Number','Enumeration District Number', 'Ward of City']

# df = df[geo_relevant_cols]

# df.ZIP.value_counts()

# df.RecordId.nunique()

# found = pd.read_csv('1910MAN/1910MANstCLN_W7_found.csv')[['RecordId','Longitude', 'Latitude', 'Side']]

# found

# found.Side.value_counts()

# fdf = df[df.RecordId.isin( set(found.RecordId.tolist()) )]

# fdf['House Number'].value_counts()

# fdf.St_MOD.value_counts()

# ndf = df[~df.RecordId.isin( set(found.RecordId.tolist()) )]

# ndf['House Number'].value_counts()

# ndf.St_MOD.value_counts()

In [None]:
# address = 'Macombs Place 16'

# street_name = ' '.join(address.split()[:-1])
# building_number = address.split()[-1]

# # np.cumsum(segment_lengths)