In [1]:
import os
import folium
import random
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from shapely.geometry import Polygon, Point

In [None]:
# Bike records from the cleaned data folder.
# NOTE(review): `bikes` is not referenced by any later cell visible in this
# notebook and this cell has no execution count — confirm it is still needed.
bikes = pd.read_csv(filepath_or_buffer='../cleaned_data/bikes_7-9.csv')

In [2]:
# Load the three cleaned tables used below: bikes flowing in, bikes flowing
# out, and the fence geometry table.
inbikes, outbikes, fences = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../cleaned_data/inbikes.csv',
        '../cleaned_data/outbikes.csv',
        '../cleaned_data/fence_position.csv',
    )
]

In [3]:
# Peek at five randomly chosen fence rows to check the column layout.
fences.sample(n=5)

Unnamed: 0,FENCE_ID,LONGITUDE_0,LATITUDE_0,LONGITUDE_1,LATITUDE_1,LONGITUDE_2,LATITUDE_2,LONGITUDE_3,LATITUDE_3,LONGITUDE_4,LATITUDE_4,LATITUDE,LONGITUDE,ROAD,AREA,FID
3480,厦禾路(厦门六中与公园东路段)_R_14,118.084861,24.464837,118.084826,24.464828,118.084821,24.464844,118.084856,24.464853,118.084861,24.464837,24.46484,118.084841,厦禾路厦门六中与公园东路段,3.452099,3480
7874,洪前路0_R_A30007,118.153592,24.48299,118.153542,24.482996,118.153544,24.483013,118.153595,24.483006,118.153592,24.48299,24.483001,118.153568,洪前路,4.967035,7874
13770,鹭江道_R_A13002,118.073058,24.467635,118.073008,24.467625,118.073004,24.467643,118.073054,24.467653,118.073058,24.467635,24.467639,118.073031,鹭江道,5.481748,13770
1689,会展南二路_R_A07004,118.166743,24.45706,118.166708,24.457026,118.166697,24.457035,118.166732,24.457069,118.166743,24.45706,24.457048,118.16672,会展南二路,4.030318,1689
4355,嘉禾路0_L_A19026,118.119243,24.507165,118.119216,24.507192,118.119228,24.507202,118.119255,24.507175,118.119243,24.507165,24.507184,118.119236,嘉禾路,3.469235,4355


In [4]:
# Seed the stdlib generator so the sampled points are the same on every run.
random.seed(0)
def random_points_within(poly, num_points):
    """Sample random points that lie within `poly` by rejection sampling.

    Candidates are drawn uniformly from the polygon's bounding box using the
    module-level `random` generator and kept only when `candidate.within(poly)`
    is true.

    Args:
        poly: polygon to sample from; must expose `bounds`, `is_empty`,
            and `area`.
        num_points: how many points to return. May be a float, since it is
            only compared against `len(points)`.

    Returns:
        A list of `Point` objects. Empty when `num_points` is not positive.

    Raises:
        ValueError: if points are requested from an empty or zero-area
            polygon. No candidate can ever be accepted for such a polygon,
            so the sampling loop would otherwise never terminate.
    """
    if num_points > 0 and (poly.is_empty or poly.area == 0):
        raise ValueError("cannot sample points within an empty or zero-area polygon")

    points = []
    if num_points <= 0:
        return points

    min_x, min_y, max_x, max_y = poly.bounds

    while len(points) < num_points:
        # Draw x first, then y, so the consumed random sequence is stable.
        random_point = Point([random.uniform(min_x, max_x), random.uniform(min_y, max_y)])
        if random_point.within(poly):
            points.append(random_point)

    return points


# For every fence row, collect the points that represent it in the clustering
# step: its first four (latitude, longitude) corners, plus extra random
# interior points for fences whose AREA exceeds 10.
fid_posdic = {}
for i in range(len(fences)):
    p1 = (fences.at[i, 'LATITUDE_0'], fences.at[i, 'LONGITUDE_0'])
    p2 = (fences.at[i, 'LATITUDE_1'], fences.at[i, 'LONGITUDE_1'])
    p3 = (fences.at[i, 'LATITUDE_2'], fences.at[i, 'LONGITUDE_2'])
    p4 = (fences.at[i, 'LATITUDE_3'], fences.at[i, 'LONGITUDE_3'])
    p5 = (fences.at[i, 'LATITUDE_4'], fences.at[i, 'LONGITUDE_4'])

    base = [p1, p2, p3, p4]
    area = fences.at[i, 'AREA']
    if area > 10:
        # One extra point per 3 units of area (plus one), sampled inside the
        # polygon built from all five vertices.
        poly = Polygon([p1, p2, p3, p4, p5])
        extra_points = random_points_within(poly, area // 3 + 1)
        base.extend((pt.x, pt.y) for pt in extra_points)
    fid_posdic[i] = base


In [5]:
# Reverse lookup from a coordinate tuple to the fence rows it represents, and
# the de-duplicated list of coordinates (`x`) that is fed to the clustering.
fid_reverse_proj = collections.defaultdict(list)
unique_positions = set()
for fid, positions in fid_posdic.items():
    for pos in positions:
        fid_reverse_proj[pos].append(fid)
        unique_positions.add(pos)
x = list(unique_positions)

In [6]:
# Cluster the fence points. eps is compared against the raw coordinate values
# stored in `x`, so it is expressed in the same units as those coordinates.
# NOTE(review): with min_samples = 1 no point should receive the noise
# label -1 — confirm against the DBSCAN documentation.
fence_dbscan = DBSCAN(eps=0.0002, min_samples=1)
fence_cluster = fence_dbscan.fit_predict(x)

In [7]:
# Count how many points fell into each cluster, then report the number of
# clusters and the (point count, cluster label) pairs at both extremes.
cluster_cnts = collections.Counter(fence_cluster)
point_cnts = sorted(((size, label) for label, size in cluster_cnts.items()), reverse=True)
largest, smallest = point_cnts[0], point_cnts[-1]
print("Number of clusters")
print(len(cluster_cnts))
print("Max/min number of points in clusters")
print(largest)
print(smallest)

Number of clusters
4672
Max/min number of points in clusters
(422, 9)
(4, 2)


In [8]:
# Assign each fence the cluster label of its points. If two points of the same
# fence carry different labels, "ERROR" and the previously stored label are
# printed, and the label seen last is the one that is kept.
fence_belong = {}
for point, belong in zip(x, fence_cluster):
    for f in fid_reverse_proj[point]:
        if fence_belong.get(f, belong) != belong:
            print("ERROR")
            print(fence_belong[f])
        fence_belong[f] = belong

In [9]:
clusters = []
for i in range(len(fences)):
    clusters.append(fence_belong[i])
fences['CLUSTER'] = clusters

In [10]:
# Total fence area per cluster. The aggregation is named by its string alias
# rather than by passing the Python builtin `sum`, which recent pandas
# versions deprecate as an `agg` argument.
cluster_area = fences.groupby('CLUSTER').agg({'AREA': 'sum'})
cluster_area

Unnamed: 0_level_0,AREA
CLUSTER,Unnamed: 1_level_1
0,590.892024
1,28.823845
2,7.653978
3,18.079511
4,65.842023
...,...
4667,9.083773
4668,4.236426
4669,6.010068
4670,6.834900


In [11]:
# Split both bike tables into outliers (no nearest fence, flagged by -1, or
# more than 30 distance units from it) and the rows that are kept.
# The "keep" masks are written out explicitly instead of negating the outlier
# masks so that rows with a missing DISTANCE are treated as before.
in_is_outlier = (inbikes['NEAREST_FENCE'] == -1) | (inbikes['DISTANCE'] > 30)
in_is_valid = (inbikes['NEAREST_FENCE'] != -1) & (inbikes['DISTANCE'] <= 30)
out_is_outlier = (outbikes['NEAREST_FENCE'] == -1) | (outbikes['DISTANCE'] > 30)
out_is_valid = (outbikes['NEAREST_FENCE'] != -1) & (outbikes['DISTANCE'] <= 30)

in_outliers = inbikes[in_is_outlier]
out_outliers = outbikes[out_is_outlier]
inbikes = inbikes[in_is_valid]
outbikes = outbikes[out_is_valid]

In [12]:
# Attach the fence attributes (including CLUSTER) to every bike record through
# its nearest fence, then discard the join key from the bike side.
bikes_in = (
    inbikes
    .merge(fences, how='left', left_on='NEAREST_FENCE', right_on='FID')
    .drop(columns='NEAREST_FENCE')
)
bikes_out = (
    outbikes
    .merge(fences, how='left', left_on='NEAREST_FENCE', right_on='FID')
    .drop(columns='NEAREST_FENCE')
)

In [13]:
# Number of incoming bike records per cluster, stored in a single column
# named CLUSTER and indexed by cluster label.
in_flow = bikes_in.groupby('CLUSTER').agg(CLUSTER=('CLUSTER', 'count'))
in_flow

Unnamed: 0_level_0,CLUSTER
CLUSTER,Unnamed: 1_level_1
0,498
1,17
2,63
3,98
4,46
...,...
4667,51
4668,4
4669,12
4670,16


In [14]:
# Number of outgoing bike records per cluster, stored in a single column
# named CLUSTER and indexed by cluster label.
out_flow = bikes_out.groupby('CLUSTER').agg(CLUSTER=('CLUSTER', 'count'))

In [15]:
# Per-cluster summary table: total area, in/out bike counts and derived
# flow metrics.
#
# The counts are aligned to `cluster_area.index` by label with `reindex`
# (clusters without any bike record get 0) instead of writing into a list at
# position `label`, which is only correct when labels equal row positions.
# The metrics are computed column-wise rather than with a row-wise `apply`;
# as a consequence FLOW keeps an integer dtype instead of being upcast to float.
inflow = in_flow.iloc[:, 0].reindex(cluster_area.index, fill_value=0).tolist()
outflow = out_flow.iloc[:, 0].reindex(cluster_area.index, fill_value=0).tolist()

cluster_df = pd.DataFrame({
    'CLUSTER': cluster_area.index,
    'AREA': cluster_area.values.ravel(),
    'INFLOW': inflow,
    'OUTFLOW': outflow,
})
cluster_df['INFLOW_DENSITY'] = cluster_df['INFLOW'] / cluster_df['AREA']
# Net flow: positive when more bikes arrive in the cluster than leave it.
cluster_df['FLOW'] = cluster_df['INFLOW'] - cluster_df['OUTFLOW']
cluster_df['FLOW_DENSITY'] = cluster_df['FLOW'] / cluster_df['AREA']

In [16]:
# Show the first five rows of the per-cluster summary (head() defaults to 5).
cluster_df.head()

Unnamed: 0,CLUSTER,AREA,INFLOW,OUTFLOW,INFLOW_DENSITY,FLOW,FLOW_DENSITY
0,0,590.892024,498,1042,0.842794,-544.0,-0.920642
1,1,28.823845,17,34,0.589789,-17.0,-0.589789
2,2,7.653978,63,37,8.231014,26.0,3.396926
3,3,18.079511,98,62,5.420501,36.0,1.991204
4,4,65.842023,46,145,0.698642,-99.0,-1.503599


In [17]:
# Labels of the 40 clusters with the largest INFLOW, highest first.
sort_by_inflow = cluster_df.sort_values(by="INFLOW", ascending=False)
top_40_inflow = sort_by_inflow.head(40)['CLUSTER'].values
top_40_inflow

array([ 806,    9, 1492, 2094, 1172, 1211, 3309, 1384, 1498,   75, 1608,
       1290, 1037, 1215,  330,    0, 1685,  629, 2037, 1944, 1726, 1212,
        472,  595,  501, 1770,  263,  337,  953, 2431, 2921, 1711,  376,
       2982,   71,   94,  579, 3863,  188, 1007], dtype=int64)

In [18]:
# Labels of the 40 clusters with the largest INFLOW_DENSITY, highest first.
sort_by_inflow_density = cluster_df.sort_values(by="INFLOW_DENSITY", ascending=False)
top_40_inflow_density = sort_by_inflow_density.head(40)['CLUSTER'].values
top_40_inflow_density

array([1608, 2037, 3127,   82, 3309,  660, 3863, 2921,  806, 4229, 3526,
       3642,  949, 3747, 1265, 4413, 4298, 1212, 3382, 2901, 3826,  547,
       3277, 3292, 1688, 2623, 4001, 3907, 3211, 3041, 1803, 4407,   71,
       4007, 1273, 4037, 2048, 2257,  419, 1290], dtype=int64)

In [19]:
# Labels of the 40 clusters with the largest net FLOW, highest first.
sort_by_flow = cluster_df.sort_values(by="FLOW", ascending=False)
top_40_flow = sort_by_flow.head(40)['CLUSTER'].values
top_40_flow

array([1492,  806, 2094, 1608,   75, 1172,  629, 1215, 1290, 1384, 3309,
        337,  953, 2037, 1726, 1944, 1007,   71, 2982, 3127,  579, 1685,
       1688, 1711, 3382,  419, 1212, 1173,  472,  660,   36, 3167, 1564,
       1649, 1265,  605, 1829, 3786, 1767,   23], dtype=int64)

In [20]:
# Labels of the 40 clusters with the largest FLOW_DENSITY, highest first.
sort_by_flow_density = cluster_df.sort_values(by="FLOW_DENSITY", ascending=False)
top_40_flow_density = sort_by_flow_density.head(40)['CLUSTER'].values
top_40_flow_density

array([1608, 3127, 2037,  660,  806, 3309, 3747, 3642, 1688, 3382, 4229,
       3277, 1265, 3907, 2901,   71, 4407,  419,  949, 1803, 2921, 1290,
       4640, 3863, 2257, 1212, 4037, 1273, 3167, 4225, 2217,  953, 4425,
       3306,  106, 2934, 1711,  917, 2525, 3165], dtype=int64)

In [21]:
def export_res(cluster_keys, fname, fences_df):
    """Write one line per fence telling whether its cluster is "crowded".

    The file starts with the header ``FENCE_ID | FENCE_TYPE | BELONG_AREA``
    followed by one ``<fence id>|<0 or 1>|<cluster>`` line per row of
    `fences_df`, in row order. The flag is 1 when the fence's cluster is in
    `cluster_keys`.

    Args:
        cluster_keys: iterable of hashable cluster labels regarded as crowded.
        fname: path of the UTF-8 text file to create or overwrite.
        fences_df: DataFrame with string ``FENCE_ID`` and ``CLUSTER`` columns.
            Rows are read by position, so any index is accepted (for example
            a filtered frame whose index no longer starts at 0).
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole key collection for every fence.
    crowded = set(cluster_keys)
    # The context manager closes the file even if a row fails to serialise.
    with open(fname, 'w', encoding='utf-8') as out:
        out.write('FENCE_ID | FENCE_TYPE | BELONG_AREA\n')
        for fence_id, cluster in zip(fences_df['FENCE_ID'], fences_df['CLUSTER']):
            is_crowded = 1 if cluster in crowded else 0
            out.write(fence_id + '|' + str(is_crowded) + '|' + str(cluster) + '\n')

In [22]:
def crowd_fence_position(cluster_keys, fences_df):
    """Return the (LATITUDE, LONGITUDE) pair of every fence in a crowded cluster.

    Args:
        cluster_keys: iterable of hashable cluster labels regarded as crowded.
        fences_df: DataFrame with ``CLUSTER``, ``LATITUDE`` and ``LONGITUDE``
            columns. Rows are read by position, so any index is accepted
            (for example a filtered frame whose index no longer starts at 0).

    Returns:
        A list of ``(latitude, longitude)`` tuples in row order, one per fence
        whose ``CLUSTER`` value is contained in `cluster_keys`.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole key collection for every fence.
    crowded = set(cluster_keys)
    rows = zip(fences_df['CLUSTER'], fences_df['LATITUDE'], fences_df['LONGITUDE'])
    return [(lat, lon) for cluster, lat, lon in rows if cluster in crowded]

In [23]:
# Positions of all fences that belong to the 40 clusters with the highest
# inflow, followed by how many such fences there are.
sort_by_inflow_points = crowd_fence_position(cluster_keys=top_40_inflow, fences_df=fences)
print(len(sort_by_inflow_points))

563


In [24]:
# Draw one marker per fence of the top-40 inflow clusters.
# zoom_control is passed as the boolean False: the string 'False' is a
# non-empty (truthy) value and does not switch the zoom control off.
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control=False,
               attr='AutoNavi')
for point in sort_by_inflow_points:
    folium.Marker(point).add_to(m)
m

In [25]:
# Positions of all fences in the 40 clusters with the highest inflow density,
# printed as a count and drawn as one marker each.
sort_by_inflow_density_points = crowd_fence_position(top_40_inflow_density, fences)
print(len(sort_by_inflow_density_points))
# zoom_control is passed as the boolean False: the string 'False' is a
# non-empty (truthy) value and does not switch the zoom control off.
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control=False,
               attr='AutoNavi')
for point in sort_by_inflow_density_points:
    folium.Marker(point).add_to(m)
m

54


In [26]:
# Positions of all fences in the 40 clusters with the highest net flow,
# printed as a count and drawn as one marker each.
sort_by_flow_points = crowd_fence_position(top_40_flow, fences)
print(len(sort_by_flow_points))
# zoom_control is passed as the boolean False: the string 'False' is a
# non-empty (truthy) value and does not switch the zoom control off.
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control=False,
               attr='AutoNavi')
for point in sort_by_flow_points:
    folium.Marker(point).add_to(m)
m

236


In [27]:
# Positions of all fences in the 40 clusters with the highest flow density,
# printed as a count and drawn as one marker each.
sort_by_flow_density_points = crowd_fence_position(top_40_flow_density, fences)
print(len(sort_by_flow_density_points))
# zoom_control is passed as the boolean False: the string 'False' is a
# non-empty (truthy) value and does not switch the zoom control off.
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control=False,
               attr='AutoNavi')
for point in sort_by_flow_density_points:
    folium.Marker(point).add_to(m)
m

61


In [28]:
# export_res(top_40_inflow,'../result.txt',fences)  # eps=30m, score=16.x

In [29]:
# export_res(top_40_flow,'../result.txt',fences)  # eps=20m, score=19.x

In [30]:
# export_res(top_40_flow_density,'../result.txt',fences) #eps=20m