In [2]:
# Find the stored location closest to a query point
# Closeness metrics between feature vectors

import pandas as pd
import numpy as np
import random
import math
import scipy.spatial.distance as ssd

In [37]:
# Two hand-written anchor points, followed by 100 generated points that walk
# along a diagonal in 0.01-degree steps with a random integer `value` in [0, 15].
ls = [
    dict(lat=59.940069, lon=30.320465, value=5),
    dict(lat=59.640069, lon=30.420465, value=10),
]

for i in np.arange(0.0, 1.0, 0.01):
    ls += [dict(lat=59.9400 + i, lon=30.320465 + i, value=random.randint(0, 15))]

data = pd.DataFrame(ls)

In [38]:
# Preview the first rows of the synthetic points frame built above.
data.head()

Unnamed: 0,lat,lon,value
0,59.940069,30.320465,5
1,59.640069,30.420465,10
2,59.94,30.320465,7
3,59.95,30.330465,3
4,59.96,30.340465,8


In [39]:
# Keep handles on the coordinate columns and store each row's (lat, lon) pair
# as a tuple in a new `location` column.
lats, lons = data['lat'], data['lon']
data['location'] = [(lat, lon) for lat, lon in zip(lats, lons)]

def find_closest_location(df, lat, lon):
    """Return the stored location closest to the query point ``(lat, lon)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``location`` column holding ``(lat, lon)`` tuples.
        The frame is not modified.
    lat, lon : float
        Coordinates of the query point, in the same units as ``location``.

    Returns
    -------
    tuple
        The ``(lat, lon)`` tuple from ``df['location']`` with the smallest
        distance to the query, or ``(None, None)`` when there is no row with
        a usable distance (for example an empty frame).

    Notes
    -----
    The distance is the plain Euclidean norm of the coordinate differences,
    i.e. it is measured in raw coordinate units and is not a geodesic
    distance.
    """
    # The previous `(lat, lon) in df['location']` shortcut tested the Series
    # *index*, not its values, so it never matched. An exact match simply has
    # distance 0 and is picked up by the minimum below.
    distances = (
        df['location']
        .map(lambda loc: math.sqrt((loc[0] - lat) ** 2 + (loc[1] - lon) ** 2))
        .reset_index(drop=True)  # labels become row positions
        .dropna()
    )
    if distances.empty:
        return None, None

    # idxmin() returns a positional label here because of the reset above, so
    # the lookup also works for frames with a non-default or duplicated index.
    return df['location'].iloc[distances.idxmin()]

In [40]:
# Sanity check: look up the stored location nearest to a hand-picked query point.
find_closest_location(data, 59.940069, 30.340465)

(59.949999999999996, 30.330465)

In [47]:
# Same synthetic layout as before, but every point carries two random integer
# features (`f1`, `f2`, each in [0, 15]) instead of a single value.
features = [
    dict(lat=59.940069, lon=30.320465, f1=5, f2=8),
    dict(lat=59.640069, lon=30.420465, f1=10, f2=2),
]

for i in np.arange(0.0, 1.0, 0.01):
    features += [
        dict(
            lat=59.9400 + i,
            lon=30.320465 + i,
            f1=random.randint(0, 15),
            f2=random.randint(0, 15),
        )
    ]

data2 = pd.DataFrame(features)

In [137]:
def prepare(data2):
    """Collapse ``lat``/``lon`` into a ``location`` column and list the features.

    The frame is modified in place: a ``location`` column of ``(lat, lon)``
    tuples is added and the ``lat`` and ``lon`` columns are removed. Calling
    this twice on the same frame therefore raises ``KeyError``.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Frame with ``lat`` and ``lon`` columns plus any number of feature
        columns.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame object, and the names of every column other than
        ``location``.
    """
    # Both coordinates must come from the frame being prepared; the previous
    # code took the longitude from an unrelated global frame named `data`.
    data2['location'] = list(zip(data2['lat'], data2['lon']))
    del data2['lat']
    del data2['lon']
    feature_columns = [name for name in data2.columns if name != 'location']
    return data2, feature_columns

In [142]:
def count_distances(df, columns, location, mode):
    """Rank the rows of ``df`` by feature-space distance to one reference row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``location`` column and the feature ``columns``.
        The frame is not modified.
    columns : list
        Names of the numeric feature columns that form each row's vector.
    location : tuple
        Value of ``location`` identifying the reference row. If several rows
        share it, the first one is used as the reference.
    mode : str
        ``'cos'`` selects cosine distance; any other value selects Euclidean
        distance.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the reference row, with an added ``metrics``
        column, sorted by ``metrics`` in ascending order.

    Raises
    ------
    ValueError
        If no row of ``df`` has the requested ``location``.
    """
    # Compare element by element so the tuple is always treated as one value
    # and never as an array-like to be aligned with the column.
    is_query = df['location'].map(lambda loc: loc == location).to_numpy(dtype=bool)
    if not is_query.any():
        raise ValueError('location %r not found in df' % (location,))

    # DataFrame.as_matrix() has been removed from pandas; to_numpy() replaces it.
    matrix = df[columns].to_numpy()
    query_pos = int(is_query.argmax())
    # A single 1-D row: the scipy distance functions require 1-D vectors.
    vector = matrix[query_pos]

    if mode == 'cos':
        res = count_cosine(matrix, vector)
    else:
        res = count_euclid(matrix, vector)

    scored = df.assign(metrics=res)
    # Remove the reference row by position. Dropping "whichever row sorted
    # first" removes the wrong row when another row ties at distance 0
    # (e.g. a proportional vector under cosine distance).
    scored = scored.iloc[np.arange(len(scored)) != query_pos]
    return scored.sort_values(['metrics'])

In [143]:
def count_cosine(matrix, vector):
    """Cosine distance from every row of ``matrix`` to ``vector``.

    Parameters
    ----------
    matrix : array-like, shape (n, k)
        One feature vector per row.
    vector : array-like
        Reference vector with ``k`` elements. A ``(1, k)`` row is accepted
        and flattened.

    Returns
    -------
    list
        ``n`` cosine distances, in row order.
    """
    # ssd.cosine rejects non-1-D input in recent SciPy releases, and callers
    # that slice a frame pass a (1, k) row, so flatten the reference first.
    reference = np.ravel(vector)
    return [ssd.cosine(row, reference) for row in matrix]

In [144]:
def count_euclid(matrix, vector):
    """Euclidean distance from every row of ``matrix`` to ``vector``.

    Parameters
    ----------
    matrix : array-like, shape (n, k)
        One feature vector per row.
    vector : array-like
        Reference vector with ``k`` elements. A ``(1, k)`` row is accepted
        and flattened.

    Returns
    -------
    list
        ``n`` Euclidean distances, in row order.
    """
    # ssd.euclidean rejects non-1-D input in recent SciPy releases, and callers
    # that slice a frame pass a (1, k) row, so flatten the reference first.
    reference = np.ravel(vector)
    return [ssd.euclidean(row, reference) for row in matrix]

In [145]:
# Rank the rows of data2 by cosine distance to the row stored at this location.
# The tuple literal is the value printed by the find_closest_location call above.
# NOTE(review): `feature_columns` is not assigned in any visible cell; presumably it
# came from `data2, feature_columns = prepare(data2)`. Confirm and add that call so
# the notebook runs from a fresh kernel.
d = count_distances(data2, feature_columns,  (59.949999999999996, 30.330465), 'cos')

In [146]:
# Show the top of the ranking returned by count_distances (sorted by `metrics`).
d.head()

Unnamed: 0,f1,f2,location,metrics
7,7,4,"(59.99, 30.370465)",4e-05
86,10,6,"(60.78, 31.160465)",7.6e-05
56,9,5,"(60.48, 30.860465)",0.00022
57,13,7,"(60.49, 30.870465)",0.000582
28,12,8,"(60.2, 30.580465)",0.001795


In [164]:
# Load the education points file, overwrite its column names, and store each
# row's (lat, lon) pair as a tuple in `location`.
# NOTE(review): assigning `columns` assumes the CSV has exactly four columns in
# this order; confirm against the file.
educ = pd.read_csv('data/educ_SPB.csv')
educ.columns = ['amenity', 'name', 'lat', 'lon']
educ['location'] = list(zip(educ['lat'], educ['lon']))

In [None]:
# Load the base feature table and keep only the coordinates and the two score columns.
ds1 = pd.read_csv('data/olya_features.csv')
ds1 = ds1[['lng', 'lat', 'criminality_score', 'complaints_score']]
# Positional rename: lng -> lon, criminality_score -> crime, complaints_score -> complain.
ds1.columns = ['lon', 'lat', 'crime', 'complain']
# (lat, lon) tuples, the same layout the distance helpers read from `location`.
ds1['location'] = list(zip(ds1['lat'], ds1['lon']))

In [170]:
def find_closest_min_location(df, lat, lon):
    """Return the distance from ``(lat, lon)`` to the nearest location in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``location`` column holding ``(lat, lon)`` tuples.
        The frame is not modified.
    lat, lon : float
        Coordinates of the query point, in the same units as ``location``.

    Returns
    -------
    float
        The smallest distance to any stored location (``0.0`` for an exact
        match), or ``nan`` when ``df`` has no rows.

    Notes
    -----
    The distance is the plain Euclidean norm of the coordinate differences,
    i.e. it is measured in raw coordinate units and is not a geodesic
    distance.
    """
    # An empty frame has no nearest point. Return a numeric missing value so
    # the result can be stored in a float column; the previous code returned
    # the tuple (None, None) here.
    if len(df) == 0:
        return float('nan')

    # The previous `(lat, lon) in df['location']` shortcut tested the Series
    # *index*, not its values, and would have returned the coordinates rather
    # than a distance. An exact match simply yields 0.0 from the minimum.
    distances = df['location'].map(
        lambda loc: math.sqrt((loc[0] - lat) ** 2 + (loc[1] - lon) ** 2)
    )
    return distances.min()

In [171]:
# For every row of ds1, the distance to the nearest education point.
# Each call scans all of `educ`, so the cost grows with rows(ds1) * rows(educ).
ds1['educ_dist'] = ds1['location'].map(lambda x: find_closest_min_location(educ,  x[0], x[1]))

In [178]:
# Same steps as for `educ`, applied to the `rest` points file: load, rename
# columns, build (lat, lon) tuples, then store the nearest distance per ds1 row.
rest = pd.read_csv('data/rest_SPB.csv')
rest.columns = ['amenity', 'name', 'lat', 'lon']
rest['location'] = list(zip(rest['lat'], rest['lon']))
ds1['rest_dist'] = ds1['location'].map(lambda x: find_closest_min_location(rest,  x[0], x[1]))

In [179]:
# Same steps again for the `infrastr` points file; the result goes to `infrastr_dist`.
infrastr = pd.read_csv('data/infrastr_SPB.csv')
infrastr.columns = ['amenity', 'name', 'lat', 'lon']
infrastr['location'] = list(zip(infrastr['lat'], infrastr['lon']))
ds1['infrastr_dist'] = ds1['location'].map(lambda x: find_closest_min_location(infrastr,  x[0], x[1]))

In [181]:
# Same steps again for the `med` points file; the result goes to `med_dist`.
med = pd.read_csv('data/med_SPB.csv')
# NOTE(review): the first column is named 'bbb' here while the sibling cells use
# 'amenity'; it looks like a placeholder, confirm. The column is not read below.
med.columns = ['bbb', 'name', 'lat', 'lon']
med['location'] = list(zip(med['lat'], med['lon']))
ds1['med_dist'] = ds1['location'].map(lambda x: find_closest_min_location(med,  x[0], x[1]))

In [188]:
# Preview the enriched table.
# NOTE(review): the saved output already shows `shops_dist`, which is only created
# in the following cell (In [187] ran before this In [188]). After Restart & Run
# All this preview will not contain that column unless the cells are reordered.
ds1.head()

Unnamed: 0,lon,lat,crime,complain,location,educ_dist,rest_dist,infrastr_dist,med_dist,shops_dist
0,30.229946,59.844715,0.679315,1.117699,"(59.844715, 30.229946)",0.009675,0.003157,0.002949,0.003971,0.001185
1,30.243179,59.930236,0.9105,1.172718,"(59.9302363, 30.2431786)",0.000584,0.001834,0.002258,0.00057,0.001469
2,30.324209,60.020927,0.9105,1.223731,"(60.020927, 30.324209)",0.002645,0.003006,0.003789,0.002193,0.00171
3,30.316559,59.989446,0.9105,1.240615,"(59.989446, 30.3165589)",0.006782,0.002118,0.006692,0.007717,0.00095
4,30.473965,60.059112,-1.31855,-1.798249,"(60.0591117, 30.4739647)",0.048342,0.005432,0.014119,0.001383,0.002467


In [187]:
# Load the shops points file (three columns here, no `name`), build (lat, lon)
# tuples, then store the nearest-shop distance per ds1 row.
shops = pd.read_csv('data/shops_SPB.csv')
shops.columns = ['shop', 'lat', 'lon']
shops['location'] = list(zip(shops['lat'], shops['lon']))
ds1['shops_dist'] = ds1['location'].map(lambda x: find_closest_min_location(shops,  x[0], x[1]))

In [189]:
# Persist ds1 with all distance columns; index=False leaves the row index out of the file.
ds1.to_csv('crime_complain_educ_rest_infrastr_med_shops.csv', encoding='utf-8', index=False)

In [8]:
# Load a separately produced table; the preview below shows it carries
# `bus_dist` and `subw_dist` columns.
# NOTE(review): no visible cell writes 'new_features.csv'; confirm its origin.
toadd = pd.read_csv('new_features.csv')
toadd.head()

Unnamed: 0,lon,lat,crime,complain,location,bus_dist,subw_dist
0,30.229946,59.844715,0.679315,1.117699,"(59.844715000000001, 30.229946000000002)",0.004057,0.021782
1,30.243179,59.930236,0.9105,1.172718,"(59.930236300000004, 30.243178600000004)",0.002476,0.02025
2,30.324209,60.020927,0.9105,1.223731,"(60.020926999999993, 30.324209000000003)",0.003348,0.009749
3,30.316559,59.989446,0.9105,1.240615,"(59.989445999999994, 30.316558899999997)",0.001341,0.016235
4,30.473965,60.059112,-1.31855,-1.798249,"(60.05911170000001, 30.4739647)",0.031441,0.033056


In [15]:
# Load the NYC table. This rebinds `data`, which earlier held the synthetic points.
data = pd.read_csv('NYC.csv')
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV; confirm).
# `del` raises KeyError if the file has no such column.
del data['Unnamed: 0']
#data['location'] = list(zip(data['lat'], data['lon']))
data.head()

Unnamed: 0,lon,lat,location,shops_dist,rest_dist,bus_dist,subway_dist
0,-73.833067,40.599883,"(40.599882529687292, -73.8330671778007)",0.014949,128.535858,161.367461,119.351828
1,-73.908247,40.670008,"(40.670007739582289, -73.908246570866908)",0.000307,128.628525,161.470177,119.404417
2,-74.390799,40.692959,"(40.692958626820371, -74.390799032480444)",0.213484,129.096013,161.825522,119.861494
3,-73.838078,40.926156,"(40.926156226328992, -73.838077568846558)",0.000929,128.638233,161.603564,119.265765
4,-74.551356,41.075077,"(41.075076848671216, -74.551356172625461)",0.216853,129.363333,162.21017,119.910276


In [11]:
# Copy the bus and subway distances from `toadd` into `data`; Series assignment
# aligns on the row index.
# NOTE(review): the execution counts show this cell (In [11]) ran before the NYC
# load above (In [15]), and the saved preview that follows has the SPB columns.
# So `data` was a different frame when this ran; no visible cell loads it.
# Run top to bottom, this would attach `toadd` values to the NYC rows instead.
data['bus_dist'] = toadd['bus_dist']
data['subw_dist'] = toadd['subw_dist']

In [12]:
# Preview `data` after the two distance columns were added.
data.head()

Unnamed: 0,lon,lat,crime,complain,location,educ_dist,rest_dist,infrastr_dist,med_dist,shops_dist,food_retail,rest,educ,infrastr,med,district_year,year,floors,bus_dist,subw_dist
0,30.229946,59.844715,0.679315,1.117699,"(59.844715000000001, 30.229946000000002)",0.009675,0.003157,0.002949,0.003971,0.001185,1.387198,0.315501,-0.693321,0.909674,0.475634,0.745862,0.630134,-0.28071,0.004057,0.021782
1,30.243179,59.930236,0.9105,1.172718,"(59.930236300000004, 30.243178600000004)",0.000584,0.001834,0.002258,0.00057,0.001469,1.497607,1.029705,2.453418,1.205592,1.242706,0.160107,-0.91683,0.038097,0.002476,0.02025
2,30.324209,60.020927,0.9105,1.223731,"(60.020926999999993, 30.324209000000003)",0.002645,0.003006,0.003789,0.002193,0.00171,-1.130144,-1.081087,-0.693321,-0.933382,-1.058509,0.642959,0.583256,0.994515,0.003348,0.009749
3,30.316559,59.989446,0.9105,1.240615,"(59.989445999999994, 30.316558899999997)",0.006782,0.002118,0.006692,0.007717,0.00095,1.304445,1.08001,0.157048,1.068238,1.373051,0.133722,-0.565247,-0.28071,0.001341,0.016235
4,30.473965,60.059112,-1.31855,-1.798249,"(60.05911170000001, 30.4739647)",0.048342,0.005432,0.014119,0.001383,0.002467,0.478818,-0.000539,-0.693321,-0.933382,-0.291438,-0.642007,-0.940269,-0.918322,0.031441,0.033056


In [13]:
# Persist the combined table without the row index; it is read back later as `spb`.
data.to_csv('all_and_bus_subw.csv', index = False)

In [16]:
# Reload the NYC table into `data`; this repeats the In [15] cell above verbatim.
data = pd.read_csv('NYC.csv')
# `del` raises KeyError if the file has no 'Unnamed: 0' column.
del data['Unnamed: 0']
#data['location'] = list(zip(data['lat'], data['lon']))
data.head()

Unnamed: 0,lon,lat,location,shops_dist,rest_dist,bus_dist,subway_dist
0,-73.833067,40.599883,"(40.599882529687292, -73.8330671778007)",0.014949,128.535858,161.367461,119.351828
1,-73.908247,40.670008,"(40.670007739582289, -73.908246570866908)",0.000307,128.628525,161.470177,119.404417
2,-74.390799,40.692959,"(40.692958626820371, -74.390799032480444)",0.213484,129.096013,161.825522,119.861494
3,-73.838078,40.926156,"(40.926156226328992, -73.838077568846558)",0.000929,128.638233,161.603564,119.265765
4,-74.551356,41.075077,"(41.075076848671216, -74.551356172625461)",0.216853,129.363333,162.21017,119.910276


In [32]:
# Load the NYC crime and complaint feature table.
ny = pd.read_csv('NYC_crimes_complaints_features.csv')

In [33]:
# Copy the shop distance from the NYC table into `ny`; the assignment aligns on
# the row index.
# NOTE(review): this assumes `ny` and `data` list the same points in the same row
# order. TODO confirm, or join on the coordinates instead.
ny['shops_dist'] = data['shops_dist']
#ny['rest_dist'] = data['rest_dist']
#ny['bus_dist'] = data['bus_dist']
#ny['subw_dist'] = data['subway_dist']

In [34]:
# Persist the NYC table with the added shop distance, without the row index.
ny.to_csv('crimes_complaints_shops_NY.csv', index=False)

In [23]:
# Read back the combined table saved earlier and list its columns.
spb = pd.read_csv('all_and_bus_subw.csv')
spb.columns

Index(['lon', 'lat', 'crime', 'complain', 'location', 'educ_dist', 'rest_dist',
       'infrastr_dist', 'med_dist', 'shops_dist', 'food_retail', 'rest',
       'educ', 'infrastr', 'med', 'district_year', 'year', 'floors',
       'bus_dist', 'subw_dist'],
      dtype='object')

In [24]:
# List the columns of `ny` for comparison with `spb`.
# NOTE(review): the saved output lists rest_dist, bus_dist and subway_dist, but the
# visible cell that fills `ny` has those assignments commented out. This cell
# (In [24]) ran before `ny` was reloaded (In [32]), so the output reflects an
# earlier kernel state.
ny.columns

Index(['lon', 'lat', 'crime', 'complain', 'shops_dist', 'rest_dist',
       'bus_dist', 'subway_dist'],
      dtype='object')