In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.interpolate import interp1d

from glob import glob

import folium

# !pip3 install geopy
from geopy.distance import distance

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# !pip3 install iteround
from iteround import saferound

from collections import Counter

def flatten_list(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level deep)."""
    flat = []
    for group in l:
        flat.extend(group)
    return flat

def pairwise(iterable):
    """Yield consecutive overlapping pairs ``[x0, x1], [x1, x2], ...`` as fresh two-item lists.

    Nothing is yielded when the input has fewer than two items.
    """
    source = iter(iterable)
    previous = next(source, None)
    while True:
        try:
            current = next(source)
        except StopIteration:
            return
        yield [previous, current]
        previous = current
        
def pairwise_list(li, endpoint_unique=False, include_end=False):
    """Return the consecutive pairs of ``li`` as a list of two-item lists, optionally de-overlapped.

    With ``include_end`` truthy, the first item of every pair after the first is raised by 1,
    so ``[20, 43, 57]`` becomes ``[[20, 43], [44, 57]]``.  Otherwise, with ``endpoint_unique``
    truthy, the second item of every pair except the last is lowered by 1.  ``include_end``
    wins: when both flags are truthy only the ``include_end`` adjustment is applied.
    """
    pairs = [pair for pair in pairwise(li)]
    if include_end:
        for pair in pairs[1:]:
            pair[0] = pair[0] + 1
    elif endpoint_unique:
        for pair in pairs[:-1]:
            pair[1] = pair[1] - 1
    return pairs


def angleDiff(sourceA,targetA):
    """Signed difference ``targetA - sourceA`` in degrees, wrapped by whole turns into the -180..180 range."""
    return (targetA - sourceA + 180) % 360 - 180

def unique_num(x):
    """Count the distinct (hashable) items of ``x``."""
    distinct = set(x)
    return len(distinct)

def add_degree_to_azimuth(current, change):
    """Add ``change`` degrees to the azimuth ``current`` and wrap the sum back by one full turn.

    ``change`` must be strictly smaller than 180 in magnitude (asserted).  A sum below -180
    is wrapped up by one turn, a sum above 180 is wrapped down by one turn, and a result of
    exactly -180 is reported as +180.
    """
    assert(abs(change)<180)
    total = current + change
    if total < -180:
        wrapped = 180 - (-total - 180)
    elif total > 180:
        wrapped = -180 + (total - 180)
    else:
        wrapped = total
    # -180 and +180 are the same bearing; always report the positive one.
    return -wrapped if wrapped == -180 else wrapped
    
import geojson
# Load the street-segment GeoJSON.  The path is relative to the current working directory;
# the recorded run of this cell failed with FileNotFoundError because the file was not there.
with open('geoj.geojson') as f:
    gj = geojson.load(f)

# print(gj.keys())
# print(gj['type'],gj['name'],gj['crs'])


FileNotFoundError: [Errno 2] No such file or directory: 'geoj.geojson'

In [None]:
# Piecewise-linear map from a street's total length (meters) to a folium zoom level:
# 0 -> 20, 100 -> 16, 15000 -> 10.  With interp1d's default settings a length outside
# [0, 15000] raises ValueError.
length_to_map_zoom_mapping = interp1d([0,100,15000],[20,16,10])

def inspect_street(street_name, return_df = False):
    """Draw every segment of one street on a dark folium map for visual inspection.

    Reads the notebook-level ``df`` (columns ``street_name``, ``segment_lengths`` and
    ``coordinates``) and ``length_to_map_zoom_mapping``.

    Parameters
    ----------
    street_name : str
        Value matched exactly against ``df.street_name``.
    return_df : bool, default False
        When True, also return the matching rows (with an added ``length`` column).

    Returns
    -------
    folium.Map, or ``(folium.Map, DataFrame)`` when ``return_df`` is True.
    ``None`` (after printing a message) when no row matches ``street_name``.
    """
    # Copy so that adding the 'length' column never writes into a slice of `df`.
    part = df[df.street_name==street_name].copy()

    # Only a genuinely missing street is reported as "not found".  Any other failure
    # (missing column, malformed coordinates, ...) now surfaces instead of being
    # mislabeled by a catch-all `except`.
    if part.empty:
        print('Street Not Found: "'+street_name+'"')
        return None

    part['length'] = part['segment_lengths'].apply(sum)
    street_total_length = part.length.sum()

    street_points_list = part.coordinates.tolist()

    # Map centre = mean of the per-segment mean points.
    all_streets_center_loc = np.mean([np.mean(li, axis=0).tolist() for li in street_points_list],axis=0)

    # The interpolator rejects values outside its knots, so clamp the length to the knot
    # range: very long streets get the widest configured zoom instead of an error.
    zoom_knots = length_to_map_zoom_mapping.x
    clamped_length = min(max(street_total_length, zoom_knots[0]), zoom_knots[-1])

    my_map = folium.Map(location=all_streets_center_loc,
                        zoom_start=float(length_to_map_zoom_mapping(clamped_length)),
                        tiles="CartoDB dark_matter")

    for street_points in street_points_list:
        folium.PolyLine(street_points, color="red", weight=1).add_to(my_map)

    if return_df:
        return my_map, part

    return my_map

In [None]:
# Get the data for a segment of street from the raw GeoJson file
# Each feature becomes one row: its `properties` dict plus a 'coordinates' entry.
street_rows = []
for shape in gj['features']:
    properties_temp = shape['properties']
    # Only the first element of the geometry's coordinate array is used
    # (presumably the single line of a MultiLineString -- TODO confirm against the data).
    coordinates = shape['geometry']['coordinates'][0]
    # NOTE: np.flip without `axis` reverses EVERY axis.  For a 2-D array of points this swaps
    # the two values inside each point AND reverses the order of the points along the line.
    coordinates = np.flip(np.array(coordinates)).tolist()
    # update() mutates the feature's own properties dict inside `gj`.
    properties_temp.update({'coordinates':coordinates})
    street_rows.append(properties_temp)

In [None]:
# Create a pandas dataframe containing the raw data

df = pd.DataFrame(street_rows)

# Treat both missing values (None) and the literal placeholder 'no name' as NaN in every cell.
df = df.applymap(lambda x: np.nan if x==None or x=='no name' else x)

# Drop the Shape_Leng attribute; segment lengths are recomputed from the coordinates further down.
df.drop('Shape_Leng',axis=1,inplace=True)

In [None]:
df.info()

In [None]:
# L_f_add is the building number on the Front of the Left side of this street segment
# R_t_add is the building number on the Tail of the Right side of this street segment
# The first four fields are like these

# Prefix and Suffix are N S E W direction, 
# Pretype and Type are street types,
# Name and type collectively define the address, but sometimes it disagrees with the Full_name
df

In [None]:
raw = df.copy()

In [None]:
# Find the most frequent values for the first four fields, and see their patterns, this is how I guessed the meaning of these fields

top_k = 10
# value_counts() is sorted by descending frequency, so its first `top_k` index labels are the
# most frequent values.  Reading them from `.index` avoids relying on the column name that
# `reset_index()` produces, which is 'index' on older pandas but the series name on pandas >= 2.0.
top_starts = pd.concat([pd.Series(df[col].value_counts().index[:top_k]) for col in df.columns[:4]], axis=1)
top_starts.columns = df.columns[:4]
top_starts

### Filter out unreliable entries

In [None]:
df.head()

In [None]:
# Filter out: Building number on the L and R side change in different order/direction
# It's okay if building number grows on both L and R sides, or if building number shrinks on both L and R sides
# (a product of signs >= 0 also keeps rows where one side does not change at all)
criteria_1 = ((np.sign(df['L_t_add']-df['L_f_add'])*np.sign(df['R_t_add']-df['R_f_add']))>=0)

# Filter out: All building number start/end is zero
criteria_2 = (df['L_t_add']+df['L_f_add']+df['R_t_add']+df['R_f_add']>0)

# Filter out: Starts and ends of L and R differs too much
# Kept rows have |L_f - R_f| <= 30 and |L_t - R_t| <= 30, and additionally a strictly positive
# product of all four numbers (so none of them may be zero).
criteria_3 = (~((np.abs(df['L_f_add']-df['R_f_add'])>30)|(np.abs(df['L_t_add']-df['R_t_add'])>30))&(df['L_t_add']*df['L_f_add']*df['R_t_add']*df['R_f_add']>0))

# Keep only rows passing all three checks and renumber them from 0.
df = df[ criteria_1 & criteria_2 & criteria_3 ].reset_index(drop=True)

In [None]:
# (0,0)     (10,10)  (15,15)
# 1/2         4        5/6
# 1-----------4---------6

# 10 = 0 + 4/6*(15-0)

In [None]:
# Smallest / largest strictly positive value among the first four columns of each row.
# Assumes those columns are the four building-number fields (L_f_add, L_t_add, R_f_add,
# R_t_add) -- this depends on column order; TODO confirm.
df['min_building_num'] = df[df.columns[:4]].apply(lambda row: min([x for x in row.tolist() if x>0]), axis=1)
df['max_building_num'] = df[df.columns[:4]].apply(lambda row: max([x for x in row.tolist() if x>0]), axis=1)

In [None]:
# 1                   99
# 2        50         100  

# 50/(100-1) ~= 0.5

In [None]:
# 1 3 5 7 9
# 2 4 6 8 10

# 2 4 6 8 10
# 1 3 5 7 9

#         7?
# 1----------10
#         7?

In [None]:
# Side of the street that carries odd numbers, judged only from the parity of L_f_add:
# odd -> 'left', anything else -> 'right'.
df['odd_on'] = df['L_f_add'].apply(lambda x: 'left' if x%2==1 else 'right')

# How many segments fall into each category.
df.odd_on.value_counts()

In [None]:
# Merge Pre_type and Type into a single street-type string per row:
#   - Pre_type is not a string                      -> Type
#   - Type is not a string, or Type is contained in Pre_type -> Pre_type
#   - Pre_type is contained in Type                 -> Type
#   - otherwise                                     -> 'Pre_type Type'
# Remaining missing values become '' and surrounding whitespace is stripped.
df['Type'] = df[['Pre_type','Type']].apply(lambda row: row['Type'] if not isinstance(row['Pre_type'],str) else row['Pre_type'] if not isinstance(row['Type'],str) or row['Type'] in row['Pre_type'] else row['Type'] if row['Pre_type'] in row['Type'] else row['Pre_type']+' '+row['Type'], axis=1).fillna('').apply(str.strip)

# Abbreviation -> spelled-out street type.
# NOTE(review): 'Dr' -> 'Driveway' and 'Ct' -> 'Street' look unusual (one would expect
# 'Drive' / 'Court'); confirm they are deliberate for this dataset before relying on them.
Type_mapping = {'St': 'Street', 'Ave': 'Avenue', 'Pl': 'Place', 'Ln': 'Lane', 'Aly': 'Alley', 'Sq': 'Square', 'Rd': 'Road', 'Park': 'Park', 'Dr': 'Driveway', 'Sl': 'Slip', 'Row': 'Row', 'Ter': 'Terrace', 'Ct': 'Street', 'Wy': 'Way'}

# Expand exact matches only; any other value is kept as it is.
df['Type'] = df['Type'].apply(lambda x: Type_mapping[x] if x in Type_mapping.keys() else x)

# Same expansions with a leading space on key and value, for matching at the end of a full street name.
Type_mapping_for_full_name = {' St': ' Street', ' Ave': ' Avenue', ' Pl': ' Place', ' Ln': ' Lane', ' Aly': ' Alley', ' Sq': ' Square', ' Rd': ' Road', ' Park': ' Park', ' Dr': ' Driveway', ' Sl': ' Slip', ' Row': ' Row', ' Ter': ' Terrace', ' Ct': ' Street', ' Wy': ' Way'}



In [None]:
def replace_if_endswith(x, dic=None):
    """Expand an abbreviated suffix at the end of a street name.

    Parameters
    ----------
    x : object
        Candidate name.  Anything that is not a ``str`` (e.g. NaN) is returned unchanged.
    dic : mapping of str -> str, optional
        Suffix -> replacement pairs, tried in iteration order; the first suffix that ``x``
        ends with is used.  Defaults to the notebook-level ``Type_mapping_for_full_name``,
        looked up at call time.

    Returns
    -------
    ``x`` with only its trailing suffix replaced, or ``x`` itself when nothing matches.
    """
    if dic is None:
        dic = Type_mapping_for_full_name
    if isinstance(x, str):
        for suffix, replacement in dic.items():
            if x.endswith(suffix):
                # Rewrite the tail only.  str.replace would also rewrite earlier occurrences
                # of the same text, e.g. the ' St' inside 'E Stanton St'.
                return x[:len(x) - len(suffix)] + replacement
    return x

In [None]:
df['Full_name'] = df['Full_name'].apply(replace_if_endswith)

### About Prefix and Suffix

In [None]:
assert(len(df[(df.Prefix.notnull())&(df.Suffix.isnull())])==0)

# There is no case where Prefix is available while Suffix is missing, in other words, Suffix is always the more complete source

# df[(df.Prefix.isnull())&(df.Suffix.notnull())]
pd.concat([df.Prefix.value_counts(),df.Suffix.value_counts()], axis=1)

In [None]:
# True when the Name field contains at least one digit (e.g. '14th'); non-string names count as False.
# The pattern is a raw string: '\d' in a plain literal is an invalid escape sequence, which
# newer Python versions warn about.
df['street_name_contains_number'] = df['Name'].apply(lambda x: re.search(r'\d', x) is not None if isinstance(x,str) else False)

In [None]:
df['Suffix'] = df['Suffix'].fillna('')
df['Type'] = df['Type'].fillna('')

In [None]:
# If Name field is not available, use Full_name
# Else If Street Name contains number, then put direction indicator (NSWE) in the front (E 14th Street), otherwise put it in the end (Washington Park S).
# The direction indicator is always taken from Suffix (Prefix is not used here).
# NOTE(review): when Name is missing and Full_name is missing too, the result is not a string;
# a later str.strip over this column would then fail -- confirm this cannot happen in the data.

df['street_name'] = df.apply(lambda row: row['Full_name'] if not isinstance(row['Name'],str) else row['Suffix']+' '+row['Name']+' '+row['Type'] if row['street_name_contains_number'] else row['Name']+' '+row['Type']+' '+row['Suffix'] ,axis=1)


In [None]:
# For the streets that building numbers are in opposite orders, reverse the coordinates' order, keep rest the same
# VERY TRICKY, BUT IT WORKS, CONTACT TIM IF YOU ARE INTERESTED
# Concretely: rows whose left-side numbers decrease (L_f_add > L_t_add) get their point list
# reversed; every other row is left untouched.

df.loc[df['L_f_add']>df['L_t_add'],'coordinates'] = df.loc[df['L_f_add']>df['L_t_add'],'coordinates'].apply(lambda li: li[::-1])

# The raw address fields are dropped from here on.
df = df.drop(['L_f_add', 'L_t_add', 'R_f_add', 'R_t_add', 'Prefix', 'Pre_type','Name', 'Suffix', 'Full_name', 'City', 'State', 'Type'], axis=1)

In [None]:
# Remove blank space at rears

df.street_name = df.street_name.apply(str.strip)

In [None]:
from geographiclib.geodesic import Geodesic
geod = Geodesic.WGS84
# object for calculating distance and direction

In [None]:
# Generate information of a street segment based on coordinates

def process_list_of_points(list_of_points):
    """Measure every leg of a polyline and summarise its overall direction.

    Parameters
    ----------
    list_of_points : sequence of points
        Consecutive vertices; each point is unpacked into ``geod.Inverse`` as
        (presumably) ``lat, lon`` -- TODO confirm the axis order.

    Returns
    -------
    (segment_lengths, directions, weighted_average_direction)
        ``segment_lengths`` -- list of leg lengths in meters (geopy ``distance(...).m``).
        ``directions`` -- list of ``azi1`` values of the geodesic from each leg's SECOND
        point to its FIRST point, i.e. pointing backwards along the list.
        ``weighted_average_direction`` -- length-weighted arithmetic mean of the directions
        after discarding legs more than two standard deviations from the median direction;
        ``nan`` when there is nothing to average (fewer than two points, or zero total length).
    """
    segment_lengths = []
    directions = []
    for f_pt, t_pt in zip(list_of_points[:-1], list_of_points[1:]):
        segment_lengths.append(distance(f_pt,t_pt).m)
        directions.append(geod.Inverse(*t_pt, *f_pt)['azi1'])

    lengths_arr = np.array(segment_lengths)
    directions_arr = np.array(directions)

    # Outlier filter: keep legs strictly within 2 standard deviations of the median direction.
    # If that would discard every leg (e.g. a single leg, where the deviation is 0), keep all.
    center, spread = np.median(directions_arr), np.std(directions_arr)
    keep = (directions_arr < center + 2*spread) & (directions_arr > center - 2*spread)
    if keep.any():
        directions_arr = directions_arr[keep]
        lengths_arr = lengths_arr[keep]

    # NOTE(review): this is a plain arithmetic mean of azimuths, so legs on either side of
    # the +/-180 wrap-around average towards 0 -- confirm that is acceptable for this data.
    if lengths_arr.sum() > 0:
        weighted_average_direction = np.average(directions_arr, weights=lengths_arr)
    else:
        # np.average raises ZeroDivisionError when the weights sum to zero.
        weighted_average_direction = np.nan

    return segment_lengths, directions, weighted_average_direction

In [None]:
df['segment_lengths'],df['directions'],df['weighted_avg_direction'] = zip(*df['coordinates'].apply(process_list_of_points))

In [None]:
df.head()

In [None]:
df = df.sort_values(['street_name','min_building_num']).reset_index(drop=True)

In [None]:
# Some street segments of the same street have overlapping building-number ranges.
# Fix min_building_num: within each street, a range that does not start strictly above the
# previous row's max_building_num is moved to start right after it.  Segments whose range
# becomes empty (max <= min) are dropped afterwards.

adjusted_parts = []

# Groups are visited in order of first appearance, and rows keep their order inside each
# group, so the concatenated result has the same row order as `df`.
for street_name, data in df.groupby('street_name', sort=False):

    # Work on a copy so the assignment below never targets a slice of `df`.
    data = data.copy()

    prev_segment_max_building_num = data.max_building_num.shift(1)

    # Rows without a predecessor are never changed.
    collides = prev_segment_max_building_num.notnull() & ~(data['min_building_num'] > prev_segment_max_building_num)

    data['min_building_num'] = data['min_building_num'].mask(collides, prev_segment_max_building_num + 1).astype('int64')

    adjusted_parts.append(data)

# One concat at the end instead of DataFrame.append inside the loop (removed in pandas 2.0
# and quadratic in the number of streets).
non_overlapping_range_df = pd.concat(adjusted_parts, ignore_index=True) if adjusted_parts else df.copy()

# print(sum(non_overlapping_range_df.max_building_num > non_overlapping_range_df.min_building_num))
non_overlapping_range_df = non_overlapping_range_df[non_overlapping_range_df.max_building_num > non_overlapping_range_df.min_building_num].reset_index(drop=True)


In [None]:
# # # Now there is no overlapping building num range problem
# Sanity check: enumerate every number of every segment, count per street, and assert that
# no number occurs in more than one segment of the same street.

data = non_overlapping_range_df.copy()

# range(min, max) excludes max itself, so each segment's last number is not part of the check.
data['building_num_range'] = data[['min_building_num','max_building_num']].apply(lambda row: list(range(row['min_building_num'],row['max_building_num'])),axis=1)

# One list of per-segment number lists per street ...
overlapping_range_detection = data.groupby('street_name').agg({'building_num_range':list})

# ... flattened into a single list of numbers per street ...
overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(flatten_list)

# ... counted ...
overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda li: Counter(li))

# ... keeping only the numbers seen more than once.
overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda counter: [k for k, v in counter.items() if v > 1])

overlapping_range_detection['building_num_range'].apply(len).value_counts()

# No street may have any duplicated number left.
assert(len( overlapping_range_detection['building_num_range'][overlapping_range_detection['building_num_range'].apply(len)>0] )==0) 

# # # Check satisfied

df = non_overlapping_range_df.copy()

In [None]:
df['building_num_range_length'] = df['max_building_num'] - df['min_building_num']

# Split each row's number range across its legs in proportion to leg length, round the shares
# to whole numbers with saferound (presumably preserving their total -- see iteround), and
# take the running sum to get the number reached at every vertex after the first.
# np.intp replaces the np.int0 alias, which NumPy 2.0 removed.
df['max_building_num_end_points'] = (df.segment_lengths.apply(lambda li: np.array(li)/np.sum(li))*df['building_num_range_length']).apply(lambda li: saferound(li, places=0)).apply(lambda li: np.cumsum(li).astype(np.intp))  +  df['min_building_num']

# Prepend the starting number so there is exactly one building number per coordinate.
df['building_num_points'] = df.apply(lambda row: [row['min_building_num']] + row['max_building_num_end_points'].tolist(), axis=1)

df = df.drop('max_building_num_end_points',axis=1)

# Every row must have as many building numbers as coordinates.
assert(sum(df.building_num_points.apply(len) != df.coordinates.apply(len)) == 0)

In [None]:
# min_building_num, max_building_num:[20, 57]
# building_num_points:[20, 43, 57]

In [None]:
df.head()

In [None]:
# For street parts that have more than one segment, the endpoints are not listed in the same order as other parts. 
# For example, the last (or southeast end) of Pike street has two shape segments, and that part's points are listed in reverse order/direction compared to the other street parts. 
# After some exploration, it turns out that we need to reverse the list before using pairwise_list, so as to reflect the order of the longer sequence
# NOTE(review): reversing the whole list reverses the ORDER of the pairs as well as the points
# inside each pair, while segment_lengths, directions and building_num_points keep the original
# order.  For rows with more than one leg, the k-th coordinate pair therefore sits next to the
# length / direction / number range of a different leg when these lists are combined
# positionally -- confirm this is intended.

df['coordinates_segments'] = df['coordinates'].apply(lambda li: li[::-1]).apply(pairwise_list)

In [None]:
# coordinates:[point1, point2, point3] 
# coordinates_segments:[[point1, point2],   [point2, point3]]

In [None]:
df.head()

In [None]:
# Turn the per-vertex numbers into per-leg [first, last] ranges.  pairwise_list applies only the
# include_end adjustment when both flags are set (its endpoint_unique branch is an `elif`), so
# every range after the first starts one above the previous range's end.
df['building_num_segments'] = df['building_num_points'].apply(lambda li: pairwise_list(li, endpoint_unique=True, include_end=True))

In [None]:
# building_num_points:[20, 43, 57] 
# building_num_segments:[[20, 43],   [44, 57]]

In [None]:
df.head()

In [None]:
# Explode `df` into one row per leg: the per-row scalars (street name, odd side, average
# direction) are repeated once per leg, and the per-leg lists are flattened.  All seven
# sequences are combined purely by position.
full = pd.DataFrame(zip(np.repeat(  df.street_name.values, df['coordinates_segments'].apply(len)  ),
np.repeat(  df.odd_on.values, df['coordinates_segments'].apply(len)  ),
np.repeat(  df.weighted_avg_direction.values, df['coordinates_segments'].apply(len)  ),
flatten_list((df.building_num_segments.tolist())),
flatten_list(df.coordinates_segments.tolist()),
flatten_list((df.segment_lengths.tolist())),
flatten_list((df.directions.tolist()))), columns = ['street_name','odd_on','avg_direction','building_num_range','start_end_coordinates','segment_length','segment_direction'])

In [None]:
# Number of building numbers covered by the leg, both ends included.
full['building_num_range_length'] = full['building_num_range'].apply(lambda x: x[1]-x[0]+1)
# print(sum(full['building_num_range_length']<=0))

# Legs that received no building number at all are dropped.
full = full[full['building_num_range_length']>0].reset_index(drop=True)

# Meters of street per building number on this leg.
full['avg_length_per_building'] = full['segment_length']/full['building_num_range_length']

# Signed angle from the leg's own direction to the street part's average direction.
full['direction_deviation'] = full[['segment_direction','avg_direction']].apply(lambda row: angleDiff(row['segment_direction'],row['avg_direction']),axis=1)

In [None]:
# -------------------100m------------------#
#  H       H    H           H        H     #
# On average, each House occupies 100/5=20 meters

plt.hist(full.avg_length_per_building,bins=60)
avg_length_per_building_thres = np.percentile(full.avg_length_per_building,99)
print(f'The buildings on the 99% of the streets occupy fewer than {avg_length_per_building_thres:.0f} meters.')

In [None]:
# Remove legs whose average length per building is at least 3 times the 99th-percentile threshold
full = full[full.avg_length_per_building<avg_length_per_building_thres * 3].reset_index(drop=True)

In [None]:
# All legs of one street should agree on which side carries the odd numbers.
# List the streets where both labels occur, with the count of each label, for manual inspection.

left_right_inconsistent_correction = (
    full.groupby('street_name')
    .agg({'odd_on':Counter})['odd_on']
    .loc[lambda label_counts: label_counts.apply(len)>1]
)
left_right_inconsistent_correction.reset_index()

In [None]:
# If one label is at least ten times as frequent as the other, we treat the rarer label as a mistake in data entry or in our preprocessing.
# By this criterion, "1st Avenue" and "6th Avenue" are both set to all 'left'.

full.loc[(full.street_name=='1st Avenue') | (full.street_name=='6th Avenue'), 'odd_on'] = 'left'

In [None]:
#-------------------------------------------------------------#
    # distance from center to one side = 5 meters
# Street

#-------------------------------------------------------------#




#-------------------------------------------------------------#

    # distance from center to one side = 10 meters

# Avenue



#-------------------------------------------------------------#



# Once we know the location of center of the street, then SHIFT
#      --- to the right or the left (depending on "odd_on" and the building number's odd/even status)
#      --- by ? meters depending which type of road it is


valid_road_type = {'Way', 'Street', 'Driveway', 'River', 'Square', 'Place', 'Front', 'Terrace', 'Slip', 'Island', 'Row', 'Alley', 'Road', 'Pier', 'Lane', 'Broadway', 'Park', 'Avenue'}

# Road type = last word of the street name once any "<space><digits>" run and a trailing
# one-letter compass direction have been removed; 'Generic' when that word is not a known
# type or when nothing is left.
# The patterns are raw strings, and the class is `[NESW]`: the earlier `[N|E|S|W]` also
# matched a literal '|'.
full['road_type'] = (
    full.street_name
    .apply(lambda x: re.sub(r'\s+[NESW]$', '', re.sub(r'\s+\d+', '', x)))
    .apply(lambda x: x.split()[-1] if x.split() and x.split()[-1] in valid_road_type else 'Generic')
)

# Distance (meters) from the street centre line to the building side: 10 for avenues and
# Broadway, 5 for everything else.
full['offset_from_road_center'] = full['road_type'].apply(lambda x: 10 if x in ['Avenue','Broadway'] else 5)

In [None]:
full.street_name = full.street_name.apply(lambda x: ' '.join(x.strip().split()))

# Fix after manual inspection
full.street_name = full.street_name.apply(lambda x: 'East Broadway' if x=='Broadway E' else 'West Broadway' if x=='Broadway W' else x)


In [None]:
streets = full.street_name.drop_duplicates().tolist()

In [None]:
# Voila!
# full.to_csv('_1880_streets_full.csv',index=False)

In [None]:
### Visualization # FOLIUM

colors = ['red','orange','green','blue','purple']

# Centre the overview map on the mean of all leg end points.
center_loc = np.mean(flatten_list(full.start_end_coordinates.tolist()),axis=0).tolist()

my_map = folium.Map(location=center_loc,zoom_start= 13,tiles="CartoDB dark_matter")


# [ '6th Avenue', '7th Avenue']
# [s for s in full.street_name.drop_duplicates().tolist() if s.endswith(' Avenue')]

# One polyline per street, cycling through the palette.
for i, street_name in enumerate(streets):

    legs_of_street = full[full.street_name == street_name]
    street_points = flatten_list(legs_of_street.start_end_coordinates.tolist())

    line_color = colors[i%len(colors)]
    folium.PolyLine(street_points, color=line_color, weight=1).add_to(my_map)
    
#     for pt in street_points:
#         folium.Marker(location=pt,popup='%f, %f' % tuple(pt)).add_to(my_map)

# del my_map

## Start geocoding

In [None]:
geo = pd.read_csv('datasets/1910_New York_Manhattan Ward 7.csv')

geo = geo[['RecordId','Street Name','House Number','Street Name New','House Number New','Address','Lat','Lon','Enumeration District Number','Ward of City']]

# Normalise every street name that starts with 'Rutgers' to 'Rutgers Street'.
geo['Street Name New'] = geo['Street Name New'].apply(lambda x: 'Rutgers Street' if x.startswith('Rutgers') else x)

# Keep the LAST record of every distinct address (all columns except RecordId); reset_index()
# exposes that record's original row position as the 'index' column.
geo = geo.drop_duplicates(subset=geo.columns[1:].tolist(), keep='last')[geo.columns[1:]].reset_index()

# Residents per address = distance between consecutive kept row positions.  This assumes the
# records of one address are contiguous in the file -- TODO confirm.  The first address spans
# rows 0..index[0], i.e. index[0] + 1 records (previously filled with index[0], one too few).
geo['Number of Residents'] = geo['index'].diff().fillna(geo['index'][0] + 1).apply(int)

geo = geo.drop('index',axis=1)

In [None]:
print(', '.join(sorted(geo['Street Name New'].drop_duplicates().tolist())))
print()
print(f"{np.mean(geo['Street Name New'].isin(streets))*100:.1f}% of the streets in geo are covered in our 1880 address database.")
print()

In [None]:
addresses = sorted(geo.Address.drop_duplicates().tolist())

In [None]:
def get_coordinates_from_details(target_building_num,building_num_range,start_end_coordinates,segment_direction,odd_on,offset_from_road_center):
    """Place one building number on a street leg and shift it sideways off the centre line.

    Parameters
    ----------
    target_building_num : int
        Building number to locate.
    building_num_range : pair
        First and last building number of the leg.
    start_end_coordinates : pair of points
        The leg's two end points, each unpacked into ``geod.Direct`` as its first two arguments.
    segment_direction : number
        Azimuth of the leg in degrees; the sideways shift is taken at -90 or +90 from it.
    odd_on : str
        ``'left'`` or ``'right'``.  Odd numbers with ``'left'`` and even numbers with
        ``'right'`` are shifted at -90 degrees; every other combination at +90 degrees.
    offset_from_road_center : number
        Distance passed to ``geod.Direct`` for the sideways shift.

    Returns
    -------
    list ``[lat2, lon2]`` taken from the ``geod.Direct`` result.

    The centre-line position is a weighted mean of the two end points.  The first point's
    weight grows from 0 to 1 as the number goes from the start to the end of the range, so
    the range start lands on the SECOND point.  A range of a single number uses the midpoint.
    """
    range_start, range_end = building_num_range[0], building_num_range[1]
    span = range_end - range_start

    if span == 0:
        street_center_position = np.mean(start_end_coordinates,axis=0).tolist()
    else:
        first_point_weight = (target_building_num - range_start)/span
        second_point_weight = 1 - first_point_weight
        street_center_position = np.average(np.array(start_end_coordinates), weights = (first_point_weight, second_point_weight), axis=0).tolist()

    number_is_odd = target_building_num%2==1
    use_minus_ninety = (odd_on=='left' and number_is_odd) or (odd_on=='right' and not number_is_odd)
    turn = -90 if use_minus_ninety else 90
    offset_direction = add_degree_to_azimuth(segment_direction, turn)

    target_position = geod.Direct(*street_center_position, offset_direction, offset_from_road_center)
    return [target_position['lat2'], target_position['lon2']]

In [None]:
# Given an address, first choose the relevant data based on street name
# Then find which segment of the street is relevant based on the building number
# After that, calculate the proportional location based on the building number and the start and end building numbers of the segment
# Calculate the geolocation of the center of street at that location
# Shift the location to one side by a proper distance
# Return the final coordinates

def get_addr_coordinates(addr):
    """Geocode one address of the form ``'<street name> <building number>'``.

    The last whitespace-separated token is the building number and everything before it is
    the street name, matched exactly against ``full.street_name``.  The first leg of that
    street whose ``building_num_range`` contains the number (both ends included) is used.

    Returns
    -------
    tuple
        The two coordinates produced by ``get_coordinates_from_details`` for the matching
        leg, or ``(np.nan, np.nan)`` when the address is not a string, is empty, has a
        non-integer building number, or matches no leg.
    """
    not_found = (np.nan, np.nan)

    tokens = addr.split() if isinstance(addr, str) else []
    if not tokens:
        return not_found

    try:
        target_building_num = int(tokens[-1])
    except ValueError:
        # e.g. '12A' or '1/2': cannot be placed on a numeric range, so treat it as unmatched
        # instead of aborting the whole geocoding run.
        return not_found

    target_street_name = ' '.join(tokens[:-1])

    # Stop at the first matching leg instead of computing coordinates for every row.
    for _, row in full[full.street_name==target_street_name].iterrows():
        range_start, range_end = row['building_num_range'][0], row['building_num_range'][1]
        if range_start <= target_building_num <= range_end:
            return tuple(get_coordinates_from_details(target_building_num, row['building_num_range'],row['start_end_coordinates'],row['segment_direction'],row['odd_on'],row['offset_from_road_center']))

    return not_found

In [None]:
addresses_coordinates = []
for addr in addresses:
    addresses_coordinates.append(get_addr_coordinates(addr))

addr_to_coordinates = pd.DataFrame(zip(addresses,addresses_coordinates),columns=['Address','Coordinates'])

In [None]:
# Attach the computed coordinates to every census row.  pd.merge with no `on` joins on the
# columns the two frames share, which here is 'Address'.
new_geo = pd.merge(geo,addr_to_coordinates)

# The coordinates that came with the input file are kept under 'API ...' names for comparison.
new_geo = new_geo.rename(columns = {'Lat':'API Lat','Lon':'API Lon'})

# Split the coordinate tuples into two columns; both are NaN for addresses that were not located.
new_geo['GIS Lat'],new_geo['GIS Lon'] = zip(*new_geo['Coordinates'])

In [None]:
print(f"\nPreviously, {(new_geo.drop_duplicates(subset=['Address'])['API Lat'].value_counts()[new_geo.drop_duplicates(subset=['Address'])['API Lat'].value_counts()>1].sum()/len(geo.drop_duplicates(subset=['Address'])))*100:.1f}% of the addresses have the same API lat,lon coordinate with somewhere else.\n")
print(f"In comparison, {(new_geo.drop_duplicates(subset=['Address'])['GIS Lat'].value_counts()[new_geo.drop_duplicates(subset=['Address'])['GIS Lat'].value_counts()>1].sum()/len(geo.drop_duplicates(subset=['Address'])))*100:.1f}% of the addresses have the same GIS lat,lon coordinate with somewhere else.\n")
print(f"In terms of coverage, the API method has {new_geo['API Lat'].isnull().mean()*100:.01f}% missing values, while the GIS method has {new_geo['GIS Lat'].isnull().mean()*100:.01f}% missing values.\n")

In [None]:
# Keep only the addresses that were located.  'GIS Lat' is NaN for every address whose
# geocoding returned (nan, nan); testing that column avoids comparing tuples that contain
# NaN, which only gives the intended answer while every NaN is the very same object.
valid_new_geo = new_geo[new_geo['GIS Lat'].notnull()].copy()

max_num_of_res = valid_new_geo['Number of Residents'].max()

list_of_points = valid_new_geo.Coordinates.tolist()

residents_at_points = valid_new_geo['Number of Residents'].tolist()

# Circle radius grows with the log of the resident count.  Counts below 1 are treated as 1,
# because the log of 0 is not finite and would break the colour computation below.
max_r = (0.5+np.log(max(max(residents_at_points), 1))**1.2)

center_loc = np.mean( list_of_points ,axis=0).tolist()

my_map = folium.Map(location=center_loc,zoom_start=15)# ,tiles="CartoDB dark_matter")


# Grey backdrop: every street that has at least one located address, avenues and Broadway wider.
plot_streets = valid_new_geo['Street Name New'].drop_duplicates().tolist()
for s in plot_streets:

    line_weight = 8 if s.endswith('Avenue') or s.endswith('Broadway') else 5
    street_points = full[full.street_name==s].start_end_coordinates.tolist()
    folium.PolyLine(street_points, color='dimgrey', weight=line_weight, alpha = 0.1, popup=s).add_to(my_map)


# One circle per located address; colour runs from purple (few residents) to red (most).
for i in range(len(list_of_points)):

    pt = list_of_points[i]
    num_of_res = residents_at_points[i]

    r = 0.5+np.log(max(num_of_res, 1))**1.2

    color_ratio = r/max_r
    rgb_tuple = (int(255/2 + 255/2*color_ratio),int(0),int(255/2 - 255/2*color_ratio))
    hex_code = '#%02x%02x%02x' % rgb_tuple

    folium.Circle(location=pt,radius=r,color=hex_code,alpha=0.3 ).add_to(my_map)



my_map

In [None]:
my_map.save('demo.html')

#### Point order problem

In [None]:
# this_street = '10th Avenue'
# full[full.street_name==this_street]

In [None]:
# full[full.street_name==this_street]['start_end_coordinates'].apply(lambda li: li[0][0]).plot()

In [None]:
# full[full.street_name==this_street]['start_end_coordinates'].apply(lambda li: li[0][1]).plot()

In [None]:
# full[full.street_name==this_street].segment_direction.plot()

In [None]:
# street_points_list = full[full.street_name==this_street].start_end_coordinates.tolist() 

# ### Visual inspect

# colors = ['red','orange','green','blue','purple','white','brown']

# my_map = folium.Map(location=np.mean(flatten_list(street_points_list),axis=0).tolist(),
#                         zoom_start= 17 ,
#                         tiles="CartoDB dark_matter")

# for i in range(len(street_points_list)):
#     street_points = street_points_list[i]
#     # print(i,street_points)
#     folium.PolyLine(street_points, color=colors[i%len(colors)], weight=(1+i%len(colors))).add_to(my_map)

# #[[40.710512, -73.992112], [40.710864, -73.99232], [40.711312, -73.992416]]
    
# my_map

In [None]:
# street_points_list = raw[raw['Name']==this_street.split()[0]].sort_values('L_f_add').reset_index(drop=True).coordinates.tolist()


# ### Visual inspect

# center = np.mean(flatten_list(street_points_list),axis=0).tolist()

# colors = ['red','orange','green','blue','purple','white','brown']

# my_map = folium.Map(location=center,
#                         zoom_start= 17 ,
#                         tiles="CartoDB dark_matter")

# for i in range(len(street_points_list)):
#     street_points = street_points_list[i]
#     # print(i,street_points)
#     folium.PolyLine(street_points, color=colors[i%len(colors)], weight=(1+i%len(colors))).add_to(my_map)

# my_map

**END** Point order problem <br><br><br><br><br>

#### Old Geo

In [None]:
## old_geo = pd.read_csv('/Users/timsmac/Desktop/us_census_data/aggregate/Geoloaded_1910_New York_Manhattan Ward 7.csv')

## old_geo = old_geo[['Street Name','House Number','Street Name New', 'House Number New', 'Address', 'Lon', 'Lat', 'Enumeration District Number', 'Ward of City']]

## old_geo = old_geo.drop_duplicates(['Street Name','House Number'])

## old_geo.to_csv('_old_geo_for_comparison.csv',index=False)

**END** Old Geo<br><br><br><br><br>

In [None]:
# li = [street_center_position,target_point]

# my_map = folium.Map(location=np.mean(li,axis=0).tolist(), zoom_start= 20, tiles="CartoDB dark_matter")

# for i in range(len(li)):
#     pt = li[i]
#     folium.Circle(pt, radius=2,color=colors[i%len(colors)]).add_to(my_map)
    
# my_map

In [None]:
# # # Check and verify that there is overlapping building num range problem

# data = df.copy()

# data['building_num_range'] = df[['min_building_num','max_building_num']].apply(lambda row: list(range(row['min_building_num'],row['max_building_num'])),axis=1)

# overlapping_range_detection = data.groupby('street_name').agg({'building_num_range':list})

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(flatten_list)

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda li: Counter(li))

# overlapping_range_detection['building_num_range'] = overlapping_range_detection['building_num_range'].apply(lambda counter: [k for k, v in counter.items() if v > 1])

# overlapping_range_detection['building_num_range'].apply(len).value_counts()

# overlapping_range_detection['building_num_range'][overlapping_range_detection['building_num_range'].apply(len)>0][:100]

In [None]:
# # # Fix max_building_num

# non_overlapping_range_df = pd.DataFrame()

# for street_name in df.street_name.unique().tolist():
    
#     data = df[df.street_name==street_name]

#     data['next_segment_min_building_num'] = data.min_building_num.shift(-1)

#     data['max_building_num'] = data.apply(lambda row: row['max_building_num'] if np.isnan(row['next_segment_min_building_num']) else row['max_building_num'] if row['max_building_num']<row['next_segment_min_building_num'] else row['next_segment_min_building_num']-1  , axis=1).apply(int)

#     data = data.drop('next_segment_min_building_num',axis=1)
    
#     non_overlapping_range_df = non_overlapping_range_df.append(data, ignore_index=True)
    

In [None]:
# ### Visual inspect

# street_points_list = df.coordinates.tolist()

# all_streets_center_loc = np.mean([np.mean(li, axis=0).tolist() for li in street_points_list],axis=0)

# my_map = folium.Map(location=all_streets_center_loc,
#                         zoom_start= 13 ,
#                         tiles="CartoDB dark_matter")

# for street_points in street_points_list:

#     folium.PolyLine(street_points, color="red", weight=1).add_to(my_map)

# my_map

In [None]:
# glob('1910MAN/*')

# df = pd.read_csv('1910MAN/1910MANstCLN_W7.csv')

# geo_relevant_cols = ['DataID', 'RecordId', 'HouseHoldId', 'Street Name', 'St_MOD',
#        'ADDY_CLN', 'ZIP', 'City', 'State', 'Country', 'House Number', 'HoNo_1',
#        'Dwelling Number','Enumeration District Number', 'Ward of City']

# df = df[geo_relevant_cols]

# df.ZIP.value_counts()

# df.RecordId.nunique()

# found = pd.read_csv('1910MAN/1910MANstCLN_W7_found.csv')[['RecordId','Longitude', 'Latitude', 'Side']]

# found

# found.Side.value_counts()

# fdf = df[df.RecordId.isin( set(found.RecordId.tolist()) )]

# fdf['House Number'].value_counts()

# fdf.St_MOD.value_counts()

# ndf = df[~df.RecordId.isin( set(found.RecordId.tolist()) )]

# ndf['House Number'].value_counts()

# ndf.St_MOD.value_counts()

In [None]:
# address = 'Macombs Place 16'

# street_name = ' '.join(address.split()[:-1])
# building_number = address.split()[-1]

# # np.cumsum(segment_lengths)