In [5]:
import logging
import pickle

import geopandas as gpd
import pandas as pd
import yaml
from scipy.spatial.distance import cdist
from shapely.geometry import Point
from shapely.ops import nearest_points

In [6]:
# Load the subway-station table. index_col=False tells pandas not to use the
# first CSV column as the index.
df_subways = pd.read_csv("../data/external/subways.csv", index_col=False)

In [7]:
# Round the numeric columns to 2 decimals -- presumably so station coordinates
# sit on the same 0.01-degree grid as the query coordinates (TODO confirm).
# Note: this rebinds df_subways, so the unrounded values are only recoverable by re-reading the CSV.
df_subways = df_subways.round(2) 

In [8]:
# (rows, columns) of the rounded station table.
df_subways.shape

(250, 3)

In [9]:
# Preview the first five stations.
df_subways.head(5)

Unnamed: 0,o_lat,o_long,subway_name
0,39.93,116.18,古城
1,39.91,116.19,八角游乐园
2,39.91,116.21,八宝山
3,39.91,116.24,玉泉路
4,39.91,116.25,五棵松


In [10]:
def extract_Points_df(df, lat_column, long_column, crs={'init': 'epsg:4326'}):
    """Return a GeoDataFrame copy of ``df`` with one Point per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is copied, never modified.
    lat_column, long_column : str
        Names of the columns holding latitude and longitude.
    crs : mapping or str, optional
        Coordinate reference system handed to ``GeoDataFrame``. Defaults to
        EPSG:4326. (The previous default ``{'init', 'epsg:4326'}`` was a *set*
        literal -- comma instead of colon -- and therefore not a valid CRS.)

    Returns
    -------
    geopandas.GeoDataFrame
        All columns of ``df`` plus a ``geometry`` column.

    Raises
    ------
    KeyError
        If ``lat_column`` or ``long_column`` is not a column of ``df``.
    """
    points_df = df.copy()
    # Shapely points are (x, y), i.e. (longitude, latitude).
    geometry = [Point(xy) for xy in zip(points_df[long_column], points_df[lat_column])]
    # Copy a dict CRS so frames never share the mutable default object.
    frame_crs = dict(crs) if isinstance(crs, dict) else crs
    return gpd.GeoDataFrame(points_df, crs=frame_crs, geometry=geometry)

In [11]:
# Station table as a GeoDataFrame: adds a geometry column of Point(o_long, o_lat).
gdf_subways = extract_Points_df(df_subways, lat_column="o_lat", long_column="o_long")

In [12]:
# Spot-check five random stations (unseeded, so the rows differ on every run).
gdf_subways.sample(5)

Unnamed: 0,o_lat,o_long,subway_name,geometry
98,40.13,116.64,南法信,POINT (116.64 40.13)
222,39.97,116.32,魏公村,POINT (116.32 39.97)
124,39.85,116.45,分钟寺,POINT (116.45 39.85)
178,39.91,116.46,永安里,POINT (116.46 39.91)
244,39.74,116.33,黄村西大街,POINT (116.33 39.74)


In [13]:
# Load the processed test queries.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files produced by this project's own pipeline, never files from untrusted sources.
df_queries_test = pd.read_pickle('../data/processed/df_test.pickle')

In [14]:
# Query table as a GeoDataFrame: geometry is the trip origin, Point(o_long, o_lat).
gdf_queries_test = extract_Points_df(df_queries_test, lat_column="o_lat", long_column="o_long")

In [15]:
# Spot-check five random queries (unseeded, so the rows differ on every run).
gdf_queries_test.sample(5)

Unnamed: 0,distance_plan,eta,price,transport_mode,sid,plan_time,pid,req_time,o_long,o_lat,d_long,d_lat,distance_query,geometry
304220,3661,631,700.0,3,1271177,2018-12-02 20:11:29,167799.0,2018-12-02 20:11:29,116.33,39.99,116.35,40.0,2.037188,POINT (116.33 39.99)
295751,65063,9800,1500.0,1,1248523,2018-12-02 15:15:03,185318.0,2018-12-02 15:15:03,117.1,40.1,116.68,39.89,42.780876,POINT (117.1 40.1)
88161,6934,1893,300.0,2,2066160,2018-12-07 18:23:04,,2018-12-07 18:23:04,116.37,39.91,116.42,39.9,4.417422,POINT (116.37 39.91)
236971,17301,2120,700.0,3,1336750,2018-12-04 13:21:58,,2018-12-04 13:21:58,116.46,40.01,116.32,39.97,12.755111,POINT (116.46 40.01)
390918,26487,2934,700.0,3,1278890,2018-12-02 12:43:35,193444.0,2018-12-02 12:43:35,116.45,39.86,116.41,40.05,21.37151,POINT (116.45 39.86)


In [16]:
# All station points merged into one geometry, so the nearest-station distance
# is a single shapely call per query point.
pts3 = gdf_subways.geometry.unary_union
def near(point, pts=pts3):
    """Return the scaled distance from ``point`` to the nearest point of ``pts``.

    Parameters
    ----------
    point : shapely geometry
        Query location.
    pts : shapely geometry, optional
        Candidate locations; defaults to the union of all subway stations as
        it was when this cell ran.

    Returns
    -------
    str
        Planar distance in the coordinate units of the inputs, multiplied by
        10.0 and formatted with four decimals (``"%.4f"``). Cast to float
        before doing arithmetic on it.
    """
    # The distance to the union is the distance to its closest member, so there
    # is no need to find the matching station row first. The old lookup relied on
    # Series.get_values(), which pandas 1.0 removed, and scanned every station per call.
    return "%.4f" % (point.distance(pts) * 10.0)

In [17]:
# Fixed seed so Restart & Run All reproduces the same 100-row sample.
test = gdf_queries_test.sample(100, random_state=42)

In [154]:
# One near() call per origin point, in row order; values are the strings near() returns.
test['dist_nearest_sub'] = [near(origin) for origin in test.geometry]

In [155]:
# Spot-check five rows with the new dist_nearest_sub column.
test.sample(5)

Unnamed: 0,distance_plan,eta,price,transport_mode,sid,plan_time,pid,req_time,o_long,o_lat,d_long,d_lat,distance_query,geometry,dist_nearest_sub
99846,4911,1484,700.0,6,1272695,2018-12-02 20:49:01,150302.0,2018-12-02 20:49:01,116.47,40.0,116.5,40.01,2.791919,POINT (116.47 40),0.0
8512,645,135,700.0,3,1410754,2018-12-06 12:38:34,161049.0,2018-12-06 12:38:34,116.41,39.97,116.41,39.96,1.11034,POINT (116.41 39.97),0.1
14381,53992,4670,15200.0,4,2038769,2018-12-07 12:07:06,209406.0,2018-12-07 12:07:06,116.28,40.25,116.32,39.89,40.118329,POINT (116.28 40.25),1.5297
246738,13022,2691,400.0,2,446132,2018-12-01 22:11:28,,2018-12-01 22:11:28,116.44,39.9,116.32,39.89,10.322827,POINT (116.44 39.9),0.1
291381,69,59,700.0,5,1387090,2018-12-03 19:18:34,,2018-12-03 19:18:34,116.33,39.99,116.33,39.99,0.0,POINT (116.33 39.99),0.1


In [157]:
# Summary statistics. By default describe() on a mixed-dtype frame reports only
# numeric columns; dist_nearest_sub does not appear because near() returns
# formatted strings, so that column has object dtype.
test.describe()

Unnamed: 0,distance_plan,eta,price,transport_mode,sid,pid,o_long,o_lat,d_long,d_lat,distance_query
count,100.0,100.0,100.0,100.0,100.0,56.0,100.0,100.0,100.0,100.0,100.0
mean,16325.73,2944.52,1608.0,4.46,1261166.0,160114.571429,116.4112,39.9473,116.4041,39.9533,11.781714
std,15716.438645,2076.688382,2527.572397,2.606984,412913.6,35215.428531,0.104139,0.093861,0.117448,0.105888,12.037904
min,64.0,54.0,200.0,1.0,425368.0,104802.0,116.2,39.72,116.01,39.72,0.0
25%,4311.25,1364.75,500.0,2.75,1142264.0,120756.25,116.3375,39.9,116.32,39.9,2.379713
50%,11932.0,2452.0,700.0,4.0,1374060.0,167622.0,116.41,39.93,116.39,39.93,7.583025
75%,23732.0,4187.75,1325.0,6.25,1411132.0,190016.5,116.4625,39.99,116.46,39.99,17.721744
max,65830.0,9285.0,15200.0,11.0,2071396.0,212391.0,116.68,40.25,116.74,40.35,57.038325


In [25]:
# Fixed seed so Restart & Run All reproduces the same 500-row sample.
dftest = df_queries_test.sample(500, random_state=42)
def add_dist_nearest_subway(dataf, subways_path="../data/external/subways.csv", as_float=False):
    '''
    Creates 1 new column, ``dist_nearest_sub``, with the distance to the nearest
    subway station (stations are read from subways.csv).

    Parameters
    ----------
    dataf : pandas.DataFrame
        Must contain the origin coordinates in columns 'o_lat' and 'o_long'.
        It is copied, never modified.
    subways_path : str, optional
        CSV with station coordinates in columns 'o_lat' and 'o_long'. The
        coordinates are rounded to 2 decimals after loading.
    as_float : bool, optional
        False (default) keeps the historical output: strings formatted with
        "%.3f". True stores floats rounded to 3 decimals, which is what
        numeric code (describe(), models) needs.

    Returns
    -------
    A copy of ``dataf`` with the extra ``dist_nearest_sub`` column. The value is
    the planar distance between Point(o_long, o_lat) and the closest station,
    in coordinate units, multiplied by 10.0.

    Raises
    ------
    KeyError
        If 'o_lat' or 'o_long' is missing from ``dataf``.
    ValueError
        If the station file contains no rows.
    '''
    # Validate before any I/O. The old check only logged (through a `logger`
    # name that is not defined in this notebook) and then failed later anyway.
    missing = [col for col in ('o_lat', 'o_long') if col not in dataf]
    if missing:
        message = "The dataframe doesn't have the coordinates in the correct format. They need to be 'o_lat' and 'o_long'."
        logging.getLogger(__name__).error(message)
        raise KeyError(message)

    def to_point_frame(df, lat_column="o_lat", long_column="o_long"):
        # Copy df and attach Point(longitude, latitude) geometries in EPSG:4326.
        frame = df.copy()
        geometry = [Point(xy) for xy in zip(frame[long_column], frame[lat_column])]
        return gpd.GeoDataFrame(frame, crs={'init': 'epsg:4326'}, geometry=geometry)

    df_subways = pd.read_csv(subways_path, index_col=False).round(2)
    if df_subways.empty:
        raise ValueError("No subway stations found in %r." % (subways_path,))

    gdf_subways = to_point_frame(df_subways)
    gdf_dataf = to_point_frame(dataf)

    # One merged geometry for all stations: the distance to it is the distance
    # to the nearest station, so no per-row scan over the station table is needed.
    stations = gdf_subways.geometry.unary_union

    scaled = [point.distance(stations) * 10.0 for point in gdf_dataf.geometry]
    if as_float:
        gdf_dataf['dist_nearest_sub'] = [round(value, 3) for value in scaled]
    else:
        gdf_dataf['dist_nearest_sub'] = ["%.3f" % value for value in scaled]

    # Keyword form: the positional axis argument was removed in pandas 2.0.
    return gdf_dataf.drop(columns='geometry')


In [26]:
# Run the self-contained feature function on the 500-row sample.
output = add_dist_nearest_subway(dftest)

In [27]:
# Inspect ten random rows of the result (unseeded, so the rows differ on every run).
output.sample(10)

Unnamed: 0,distance_plan,eta,price,transport_mode,sid,plan_time,pid,req_time,o_long,o_lat,d_long,d_lat,distance_query,dist_nearest_sub
420716,31050,6370,800.0,7,1249429,2018-12-02 15:37:42,216007.0,2018-12-02 15:37:42,116.65,39.89,116.41,39.87,20.650097,0.424
332299,3548,622,1300.0,4,1395663,2018-12-03 09:05:23,191435.0,2018-12-03 09:05:23,116.3,40.09,116.31,40.07,2.378891,0.141
19329,15094,3250,1700.0,10,1330014,2018-12-04 10:44:12,,2018-12-04 10:44:12,116.41,39.85,116.46,39.93,9.858467,0.0
105871,27195,5341,700.0,3,1440102,2018-12-06 17:45:32,131973.0,2018-12-06 17:45:32,116.27,39.93,116.45,39.87,16.773046,0.1
6932,9075,2204,700.0,3,443134,2018-12-01 17:31:09,202430.0,2018-12-01 17:31:09,116.4,39.93,116.47,39.95,6.381634,0.0
269176,2204,335,700.0,3,1280075,2018-12-02 10:05:26,196192.0,2018-12-02 10:05:26,116.27,39.84,116.26,39.84,0.855928,0.283
16180,11327,1987,700.0,3,1340655,2018-12-04 18:12:22,,2018-12-04 18:12:22,116.45,39.93,116.39,39.96,6.114681,0.1
388009,37701,5031,700.0,2,1342960,2018-12-04 09:32:21,112198.0,2018-12-04 09:32:21,116.3,39.91,116.56,39.78,26.523902,0.0
170065,25395,3243,8400.0,4,1271635,2018-12-02 14:14:53,167770.0,2018-12-02 14:14:53,116.17,39.92,116.37,39.86,18.357602,0.141
149557,8111,1585,2600.0,4,1346292,2018-12-04 18:37:15,,2018-12-04 18:37:15,116.31,39.89,116.35,39.92,4.774428,0.1
