In [1]:
import os
import folium
import random
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from shapely.geometry import Polygon, Point

In [2]:
bikes = pd.read_csv('../cleaned_data/removed_abnormal.csv')
if 'Unnamed: 0' in bikes.columns:
    bikes.drop('Unnamed: 0', axis = 1)

In [3]:
inbikes = bikes[bikes['LOCK_STATUS'] == 1]
outbikes = bikes[bikes['LOCK_STATUS'] == 0]
fences = pd.read_csv('../cleaned_data/fence_position.csv')

In [4]:
fences.sample(5)

Unnamed: 0,FENCE_ID,LONGITUDE_0,LATITUDE_0,LONGITUDE_1,LATITUDE_1,LONGITUDE_2,LATITUDE_2,LONGITUDE_3,LATITUDE_3,LONGITUDE_4,LATITUDE_4,LATITUDE,LONGITUDE,ROAD,AREA,FID
1612,会展六路_L_3,118.174097,24.466396,118.174168,24.466368,118.174161,24.466354,118.17409,24.466382,118.174097,24.466396,24.466375,118.174129,会展六路,6.90624,1612
9730,环岛干道0_R_A06010,118.183812,24.492767,118.183793,24.492675,118.183777,24.492678,118.183796,24.492769,118.183812,24.492767,24.492722,118.183795,环岛干道,8.846467,9730
4330,嘉禾路0_L_A19001,118.120958,24.50259,118.12095,24.502641,118.120968,24.502643,118.120976,24.502593,118.120958,24.50259,24.502617,118.120963,嘉禾路,5.426096,4330
156,东坪山路0_R_2,118.115998,24.472426,118.115996,24.472386,118.115979,24.472386,118.11598,24.472427,118.115998,24.472426,24.472406,118.115988,东坪山路,4.172825,156
86,七星西路_R_3_A,118.091718,24.488389,118.091737,24.488312,118.091718,24.488308,118.091698,24.488384,118.091718,24.488389,24.488348,118.091718,七星西路,9.040364,86


In [5]:
random.seed(0)
def random_points_within(poly, num_points):
    min_x, min_y, max_x, max_y = poly.bounds

    points = []

    while len(points) < num_points:
        random_point = Point([random.uniform(min_x, max_x), random.uniform(min_y, max_y)])
        if (random_point.within(poly)):
            points.append(random_point)

    return points


fid_posdic = dict()
for i in range(len(fences)):
    area = fences['AREA'][i]
    p1, p2, p3, p4, p5 = (fences['LATITUDE_0'][i], fences['LONGITUDE_0'][i]), (fences['LATITUDE_1'][i], fences['LONGITUDE_1'][i]), (fences['LATITUDE_2'][i], fences['LONGITUDE_2'][i]), (fences['LATITUDE_3'][i], fences['LONGITUDE_3'][i]), (fences['LATITUDE_4'][i], fences['LONGITUDE_4'][i])
    base = [p1, p2, p3, p4]
    if area > 5:
        num_points = area // 1 + 1
        poly = Polygon([p1, p2, p3, p4, p5])
        points = random_points_within(poly, num_points)
        for p in points:
            base.append((p.x, p.y))
    fid_posdic[i] = base


In [6]:
fid_reverse_proj = collections.defaultdict(list)
x = set()
for k, v in fid_posdic.items():
    fence_id = k
    for pos in v:
        fid_reverse_proj[pos].append(k)
        x.add(pos)
x = list(x)

In [7]:
fence_cluster = DBSCAN(eps = 0.00023, min_samples = 1).fit_predict(x)

In [8]:
cluster_cnts = collections.Counter(fence_cluster)
point_cnts = sorted([(v, k) for k, v in cluster_cnts.items()], reverse = True)
print("Number of clusters")
print(str(len(cluster_cnts)))
print("Max/min number of points in clusters")
print(str(point_cnts[0]))
print(str(point_cnts[-1]))

Number of clusters
4022
Max/min number of points in clusters
(875, 0)
(4, 87)


In [9]:
fence_belong = dict()
for i in range(len(fence_cluster)):
    point = x[i]
    belong = fence_cluster[i]
    curfences = fid_reverse_proj[point]
    for f in curfences:
        if f in fence_belong and fence_belong[f] != belong:
            print("ERROR")
            print(fence_belong[f])
        fence_belong[f] = belong

In [10]:
clusters = []
for i in range(len(fences)):
    clusters.append(fence_belong[i])
fences['CLUSTER'] = clusters

In [11]:
cluster_area = fences.groupby('CLUSTER').agg({'AREA':sum})
cluster_area

Unnamed: 0_level_0,AREA
CLUSTER,Unnamed: 1_level_1
0,596.191865
1,8.842340
2,27.989271
3,103.705497
4,33.660985
...,...
4017,3.242683
4018,3.220162
4019,4.346306
4020,4.024273


In [12]:
in_outliers = inbikes[(inbikes['NEAREST_FENCE'] == -1) | (inbikes['DISTANCE'] > 25)]
out_outliers = outbikes[(outbikes['NEAREST_FENCE'] == -1)|(outbikes['DISTANCE'] > 25)]
inbikes = inbikes[(inbikes['NEAREST_FENCE'] != -1) & (inbikes['DISTANCE'] <= 25)]
outbikes = outbikes[(outbikes['NEAREST_FENCE'] != -1) & (outbikes['DISTANCE'] <= 25)]

In [13]:
bikes_in = pd.merge(inbikes,fences, left_on='NEAREST_FENCE', right_on='FID',how='left')
bikes_in.drop('NEAREST_FENCE', axis = 1, inplace = True)
bikes_out = pd.merge(outbikes,fences, left_on='NEAREST_FENCE', right_on='FID',how='left')
bikes_out.drop('NEAREST_FENCE', axis = 1, inplace = True)

In [14]:
in_flow = bikes_in.groupby('CLUSTER').agg({'CLUSTER':'count'})
in_flow

Unnamed: 0_level_0,CLUSTER
CLUSTER,Unnamed: 1_level_1
0,522
1,25
2,141
3,61
4,9
...,...
4015,9
4016,8
4018,17
4019,33


In [15]:
out_flow = bikes_out.groupby('CLUSTER').agg({'CLUSTER':'count'})

In [16]:
cluster_df = pd.DataFrame()
cluster_df['CLUSTER'] = cluster_area.index
cluster_df['AREA'] = cluster_area.values.ravel()
inflow = [0]*len(cluster_cnts)
for k in cluster_area.index:
    if k in in_flow.index:
        inflow[k] = in_flow.loc[k].values[0]
cluster_df['INFLOW'] = inflow
outflow = [0]*len(cluster_cnts)
for k in cluster_area.index:
    if k in out_flow.index:
        outflow[k] = out_flow.loc[k].values[0]
cluster_df['OUTFLOW'] = outflow
cluster_df['INFLOW_DENSITY'] = cluster_df.apply(lambda x: x['INFLOW']/x['AREA'], axis = 1)
cluster_df['FLOW'] = cluster_df.apply(lambda x: x['INFLOW'] - x['OUTFLOW'], axis = 1)
cluster_df['FLOW_DENSITY'] = cluster_df.apply(lambda x: (x['INFLOW'] - x['OUTFLOW'])/x['AREA'], axis = 1)

In [17]:
cluster_df.head(5)

Unnamed: 0,CLUSTER,AREA,INFLOW,OUTFLOW,INFLOW_DENSITY,FLOW,FLOW_DENSITY
0,0,596.191865,522,785,0.875557,-263.0,-0.441133
1,1,8.84234,25,15,2.827306,10.0,1.130922
2,2,27.989271,141,42,5.037645,99.0,3.53707
3,3,103.705497,61,115,0.588204,-54.0,-0.520705
4,4,33.660985,9,117,0.267372,-108.0,-3.208462


In [18]:
sort_by_inflow = cluster_df.sort_values("INFLOW", ascending = False)
top_40_inflow = sort_by_inflow['CLUSTER'].values[:40]
top_40_inflow

array([ 507,  197,  193,    8,  530, 1152,  860,   53,  285,  401,  828,
         23,  946,   60, 2641, 1244, 2811,   44,    0, 1706, 1088,  942,
         18,  546, 1841,  163, 2711,  195, 1086,  282,   64,  499,  221,
       1101, 1925,  403,  672, 2607, 2281,  642], dtype=int64)

In [19]:
sort_by_inflow_density = cluster_df.sort_values("INFLOW_DENSITY", ascending = False)
top_40_inflow_density = sort_by_inflow_density['CLUSTER'].values[:40]
top_40_inflow_density

array([2811,  163, 2659, 2641, 3254, 3225, 2607, 1152,   71, 3656, 2966,
       3270, 3205, 2042, 3659, 1086, 2797, 3284, 3193, 3541, 3713, 2254,
       2771, 1480,  435, 2772, 3349, 3903, 3833, 1612, 3373, 1789, 1963,
        350,   60, 1147, 2149, 1051, 2690, 2432], dtype=int64)

In [20]:
sort_by_flow = cluster_df.sort_values("FLOW", ascending = False)
top_40_flow = sort_by_flow['CLUSTER'].values[:40]
top_40_flow

array([ 507,  530,  193,  860, 1152,  285,   60,  946, 2811,   53, 1088,
        828,  546, 2641,  163, 2711, 1101, 1841,  401,  282,  672, 1074,
          8,  221, 2281,   44,   18, 1480, 2659, 1086,  350,   23, 3284,
       1706,  540, 1554, 1552, 2042,  316, 3254], dtype=int64)

In [21]:
sort_by_flow_density = cluster_df.sort_values("FLOW_DENSITY", ascending = False)
top_40_flow_density = sort_by_flow_density['CLUSTER'].values[:40]
top_40_flow_density

array([2811,  163, 2659, 3254, 2641, 1152, 3270, 3205, 3656, 3713, 1480,
       2042, 3284, 2797,  435,  350, 3903,   60, 3225, 2607, 2771, 3373,
       1963, 1147, 1086, 3002, 1612, 3647, 3543, 3505, 2690,   90, 2281,
       3419,   48, 1101, 2432, 3193, 3501, 1789], dtype=int64)

In [22]:
def export_res(cluster_keys, fname, fences_df):
    f = open(fname, 'w', encoding = 'utf-8')
    f.writelines('FENCE_ID|FENCE_TYPE|BELONG_AREA')
    f.write('\n')
    for i in range(len(fences_df)):
        fence_id = fences_df['FENCE_ID'][i]
        cluster = fences_df['CLUSTER'][i]
        if cluster in cluster_keys:
            is_crowded = 1
        else:
            is_crowded = 0
        string = fence_id + '|' + str(is_crowded) + '|' + str(cluster)
        f.writelines(string)
        f.write('\n')
    f.close()

In [23]:
def crowd_fence_position(cluster_keys, fences_df):
    pos = []
    for i in range(len(fences_df)):
        fence_id = fences_df['FENCE_ID'][i]
        cluster = fences_df['CLUSTER'][i]
        if cluster in cluster_keys:
            pos.append((fences_df['LATITUDE'][i], fences_df['LONGITUDE'][i]))
    return pos

In [24]:
sort_by_inflow_points = crowd_fence_position(top_40_inflow, fences)
print(len(sort_by_inflow_points))

843


In [25]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339810 entries, 0 to 339809
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     339810 non-null  int64  
 1   BID            339810 non-null  int64  
 2   BICYCLE_ID     339810 non-null  object 
 3   LATITUDE       339810 non-null  float64
 4   LONGITUDE      339810 non-null  float64
 5   LOCK_STATUS    339810 non-null  int64  
 6   UPDATE_TIME    339810 non-null  object 
 7   GRID           339810 non-null  object 
 8   NEAREST_FENCE  339810 non-null  int64  
 9   DISTANCE       339810 non-null  float64
 10  MKTIME         339810 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 28.5+ MB


In [26]:
inbikes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140389 entries, 1 to 339808
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     140389 non-null  int64  
 1   BID            140389 non-null  int64  
 2   BICYCLE_ID     140389 non-null  object 
 3   LATITUDE       140389 non-null  float64
 4   LONGITUDE      140389 non-null  float64
 5   LOCK_STATUS    140389 non-null  int64  
 6   UPDATE_TIME    140389 non-null  object 
 7   GRID           140389 non-null  object 
 8   NEAREST_FENCE  140389 non-null  int64  
 9   DISTANCE       140389 non-null  float64
 10  MKTIME         140389 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 12.9+ MB


In [27]:
outbikes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114876 entries, 0 to 339809
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     114876 non-null  int64  
 1   BID            114876 non-null  int64  
 2   BICYCLE_ID     114876 non-null  object 
 3   LATITUDE       114876 non-null  float64
 4   LONGITUDE      114876 non-null  float64
 5   LOCK_STATUS    114876 non-null  int64  
 6   UPDATE_TIME    114876 non-null  object 
 7   GRID           114876 non-null  object 
 8   NEAREST_FENCE  114876 non-null  int64  
 9   DISTANCE       114876 non-null  float64
 10  MKTIME         114876 non-null  float64
dtypes: float64(4), int64(4), object(3)
memory usage: 10.5+ MB


In [28]:
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_inflow_points:
    folium.Marker(point).add_to(m)
m

In [29]:
sort_by_inflow_density_points = crowd_fence_position(top_40_inflow_density, fences)
print(len(sort_by_inflow_density_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_inflow_density_points:
    folium.Marker(point).add_to(m)
m

57


In [30]:
sort_by_flow_points = crowd_fence_position(top_40_flow, fences)
print(len(sort_by_flow_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_flow_points:
    folium.Marker(point).add_to(m)
m

592


In [31]:
sort_by_flow_density_points = crowd_fence_position(top_40_flow_density, fences)
print(len(sort_by_flow_density_points))
m = folium.Map(location=[24.527352, 118.10321675], zoom_start=13, zoom_control='False',
               attr='AutoNavi')
for point in sort_by_flow_density_points:
    folium.Marker(point).add_to(m)
m

58


In [32]:
export_res(top_40_flow,'../result.txt',fences) 
#eps=22m, 50m外不算,得分19.6322
#eps=15m, 50m外不算,得分19.2868
#eps=20m,30m外不算,得分19.3282

In [33]:
f = pd.read_csv('../result.txt', sep = '|')

In [34]:
f.head()

Unnamed: 0,FENCE_ID,FENCE_TYPE,BELONG_AREA
0,长乐路0_L_A17001,0,484
1,长乐路0_L_A17002,0,484
2,长乐路0_L_A17003,0,484
3,长乐路0_L_A17004,0,484
4,长乐路0_L_A17005,0,484


In [35]:
sum(f['FENCE_TYPE'])

220